# === Data Understanding ===
## ==== Step 0: Load Necessary Libraries ====
cat("\n==== Step 0: Load Necessary Libraries ====\n")
##
## ==== Step 0: Load Necessary Libraries ====
# List of all required libraries
# NOTE(review): packages are attached in this order, so later packages mask
# earlier ones (the knitted output below shows data.table's dcast/melt masking
# reshape2's, and randomForest::combine masking dplyr::combine). Keep this
# order stable, or qualify ambiguous calls with `pkg::`.
required_libraries <- c(
"tidyr", # Data tidying
"ggplot2", # Visualization
"dplyr", # Data manipulation
"caret", # Machine learning and feature selection
"randomForest", # Random Forest implementation
"scales", # Scaling and formatting in plots
"reshape2", # Data reshaping
"glue", # String interpolation
"moments", # Skewness and kurtosis calculations
"data.table", # Efficient data handling
"RColorBrewer", # Color palettes
"patchwork" # Combining ggplot objects
)
# Function to check, install, and load libraries
# Ensure a single package is installed (installing on demand) and attach it.
#
# @param package Character scalar: name of the package to install/attach.
# @return Invisibly, the value returned by library() after attaching.
load_library <- function(package) {
  if (!requireNamespace(package, quietly = TRUE)) { # Check if the package is installed
    cat(sprintf("Installing missing library: %s\n", package))
    install.packages(package, dependencies = TRUE) # Install the package if missing
    # Fail fast with a clear message if installation did not succeed
    # (no repository, no write permission, ...) instead of letting
    # library() below raise a less informative error.
    if (!requireNamespace(package, quietly = TRUE)) {
      stop(sprintf("Package '%s' could not be installed.", package), call. = FALSE)
    }
  }
  invisible(library(package, character.only = TRUE)) # Load the package
}
# Apply the function to all required libraries
# invisible() suppresses auto-printing of the list returned by lapply()
invisible(lapply(required_libraries, load_library))
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Loading required package: lattice
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:reshape2':
##
## dcast, melt
## The following objects are masked from 'package:dplyr':
##
## between, first, last
cat("All required libraries are loaded.\n")
## All required libraries are loaded.
# --- Step 1: Load Dataset ---
cat("\n==== Step 1: Load Dataset ====\n")
##
## ==== Step 1: Load Dataset ====
# Read the dataset; tryCatch() re-raises any read failure (missing file,
# malformed CSV) with added context instead of a bare low-level error.
# NOTE(review): the path is relative -- confirm the script is run from the
# directory containing car_price.csv.
data <- tryCatch(
{
read.csv("car_price.csv")
},
error = function(e) {
stop("Error loading dataset: ", e$message)
}
)
cat("Dataset loaded successfully.\n")
## Dataset loaded successfully.
# --- Step 2: Overview of Loaded Dataset ---
cat("\n==== Step 2: Overview of Loaded Dataset ====\n")
##
## ==== Step 2: Overview of Loaded Dataset ====
# Dataset dimensions
cat("\n---- Dataset Overview ----\n")
##
## ---- Dataset Overview ----
cat("Dataset Dimensions (Rows x Columns): ", dim(data), "\n")
## Dataset Dimensions (Rows x Columns): 215 26
cat("Column Names:\n", paste(names(data), collapse = ", "), "\n")
## Column Names:
## car_ID, symboling, CarName, fueltype, aspiration, doornumber, carbody, drivewheel, enginelocation, wheelbase, carlength, carwidth, carheight, curbweight, enginetype, cylindernumber, enginesize, fuelsystem, boreratio, stroke, compressionratio, horsepower, peakrpm, citympg, highwaympg, price
# Calculate and print dataset size in memory
data_size <- object.size(data)
cat("Approximate Data Size in Memory: ", format(data_size, units = "auto"), "\n")
## Approximate Data Size in Memory: 59.4 Kb
# Data types and structure
cat("\n---- Data Types and Structure ----\n")
##
## ---- Data Types and Structure ----
# str() prints the structure as a side effect and returns NULL invisibly,
# so the original print(str(data)) additionally printed a stray "NULL" line
# (visible in the previous knitted output). Calling str() directly avoids it.
str(data)
## 'data.frame': 215 obs. of 26 variables:
## $ car_ID : num 1 2 3 4 5 6 7 8 9 NA ...
## $ symboling : num 3 3 1 2 2 2 1 1 1 NA ...
## $ CarName : chr "alfa-romero giulia" "alfa-romero stelvio" "alfa-romero Quadrifoglio" "audi 100 ls" ...
## $ fueltype : chr "gas" "gas" "gas" "gas" ...
## $ aspiration : chr "std" "std" "std" "std" ...
## $ doornumber : chr "two" "two" "two" "four" ...
## $ carbody : chr "convertible" "convertible" "hatchback" "sedan" ...
## $ drivewheel : chr "rwd" "rwd" "rwd" "fwd" ...
## $ enginelocation : chr "front" "front" "front" "front" ...
## $ wheelbase : num 88.6 88.6 94.5 99.8 99.4 ...
## $ carlength : num 169 169 171 177 177 ...
## $ carwidth : num 64.1 64.1 65.5 66.2 66.4 66.3 71.4 71.4 71.4 NA ...
## $ carheight : num 48.8 48.8 52.4 54.3 54.3 53.1 55.7 55.7 55.9 NA ...
## $ curbweight : num 2548 2548 2823 2337 2824 ...
## $ enginetype : chr "dohc" "dohc" "ohcv" "ohc" ...
## $ cylindernumber : chr "four" "four" "six" "four" ...
## $ enginesize : num 130 130 152 109 136 136 136 136 131 NA ...
## $ fuelsystem : chr "mpfi" "mpfi" "mpfi" "mpfi" ...
## $ boreratio : num 3.47 3.47 2.68 3.19 3.19 3.19 3.19 3.19 3.13 NA ...
## $ stroke : num 2.68 2.68 3.47 3.4 3.4 3.4 3.4 3.4 3.4 NA ...
## $ compressionratio: num 9 9 9 10 8 8.5 8.5 8.5 8.3 NA ...
## $ horsepower : num 111 111 154 102 115 110 110 110 140 NaN ...
## $ peakrpm : num 5000 5000 5000 5500 5500 5500 5500 5500 5500 NA ...
## $ citympg : num 21 21 19 24 18 19 19 19 17 NA ...
## $ highwaympg : num 27 27 26 30 22 25 25 25 20 NA ...
## $ price : num 13495 16500 16500 13950 17450 ...
# --- Step 3: Convert Data Types ---
cat("\n==== Step 3: Convert IDs and Categorical Columns to Appropriate Types ====\n")
##
## ==== Step 3: Convert IDs and Categorical Columns to Appropriate Types ====
# Convert the identifier and all nominal columns to factors in one pass.
# car_ID is treated as an identifier, not a quantity to compute on.
factor_columns <- c(
  "car_ID", "fueltype", "aspiration", "doornumber", "carbody",
  "drivewheel", "enginelocation", "enginetype", "cylindernumber", "fuelsystem"
)
data[factor_columns] <- lapply(data[factor_columns], as.factor)
cat("Data types converted successfully.\n")
## Data types converted successfully.
# --- Step 4: Profile the Dataset ---
cat("\n==== Step 4: Profile the Dataset ====\n")
##
## ==== Step 4: Profile the Dataset ====
# Summary statistics
cat("\n---- Summary Statistics ----\n")
##
## ---- Summary Statistics ----
# car_ID was converted to a factor in Step 3, so the setdiff() on the numeric
# side is redundant but harmless; it is the categorical side that needs it.
# Note: CarName is character (not factor), so it is absent from both lists.
numeric_cols <- setdiff(names(data)[sapply(data, is.numeric)], "car_ID") # Exclude car_ID
categorical_cols <- setdiff(names(data)[sapply(data, is.factor)], "car_ID") # Exclude car_ID
cat("\n--- Summary Statistics for Numeric Columns ---\n")
##
## --- Summary Statistics for Numeric Columns ---
# NOTE(review): the summary output below reveals impossible values
# (enginesize -500, horsepower -50, price -1000, compressionratio max 1000)
# that are only partially addressed by the capping in Data Preparation Step 3.
print(summary(data[numeric_cols]))
## symboling wheelbase carlength carwidth
## Min. :-2.0000 Min. : 86.60 Min. :144.6 Min. :61.80
## 1st Qu.: 0.0000 1st Qu.: 94.50 1st Qu.:166.3 1st Qu.:64.00
## Median : 1.0000 Median : 96.95 Median :173.2 Median :65.50
## Mean : 0.8595 Mean : 98.77 Mean :174.0 Mean :65.92
## 3rd Qu.: 2.0000 3rd Qu.:102.40 3rd Qu.:183.5 3rd Qu.:66.90
## Max. : 3.0000 Max. :115.60 Max. :202.6 Max. :72.30
## NA's :30 NA's :33 NA's :30 NA's :30
## carheight curbweight enginesize boreratio stroke
## Min. :47.8 Min. :1713 Min. :-500.0 Min. :2.540 Min. :2.070
## 1st Qu.:51.6 1st Qu.:2145 1st Qu.: 94.5 1st Qu.:3.150 1st Qu.:3.110
## Median :54.1 Median :2420 Median : 110.0 Median :3.310 Median :3.270
## Mean :53.7 Mean :2556 Mean : 99.9 Mean :3.325 Mean :3.255
## 3rd Qu.:55.5 3rd Qu.:2952 3rd Qu.: 143.0 3rd Qu.:3.580 3rd Qu.:3.410
## Max. :59.8 Max. :4066 Max. :2000.0 Max. :3.940 Max. :4.170
## NA's :30 NA's :30 NA's :28 NA's :32 NA's :30
## compressionratio horsepower peakrpm citympg
## Min. : 7.000 Min. :-50.00 Min. :4150 Min. :13.00
## 1st Qu.: 8.675 1st Qu.: 69.00 1st Qu.:4800 1st Qu.:19.00
## Median : 9.000 Median : 92.00 Median :5200 Median :24.00
## Mean : 67.985 Mean : 95.95 Mean :5142 Mean :25.15
## 3rd Qu.: 9.432 3rd Qu.:116.00 3rd Qu.:5500 3rd Qu.:30.00
## Max. :1000.000 Max. :288.00 Max. :6600 Max. :49.00
## NA's :27 NA's :27 NA's :30 NA's :30
## highwaympg price
## Min. :16.00 Min. :-1000
## 1st Qu.:25.00 1st Qu.: 7336
## Median :30.00 Median : 9984
## Mean :30.63 Mean :12451
## 3rd Qu.:34.00 3rd Qu.:16448
## Max. :54.00 Max. :45400
## NA's :30 NA's :27
cat("\n--- Summary Statistics for Categorical Columns ---\n")
##
## --- Summary Statistics for Categorical Columns ---
# Frequency tables per factor column; the unlabeled rows in the output below
# are apparently empty-string "" levels acting as missing-value placeholders.
print(summary(data[categorical_cols]))
## fueltype aspiration doornumber carbody drivewheel
## : 27 : 30 : 30 :27 : 30
## diesel : 16 std :151 four:102 convertible: 6 4wd: 8
## gas :161 turbo: 34 two : 83 flyingcar :11 fwd:106
## unknown: 11 hardtop : 7 rwd: 71
## hatchback :59
## sedan :82
## wagon :23
## enginelocation enginetype cylindernumber fuelsystem
## : 30 ohc :132 : 30 mpfi :85
## front:182 : 30 eight : 4 2bbl :59
## rear : 3 ohcf : 13 five : 8 :30
## dohc : 12 four :145 idi :17
## ohcv : 12 six : 23 1bbl :11
## l : 11 twelve: 1 spdi : 8
## (Other): 5 two : 4 (Other): 5
# --- Step 5: Visualize Numeric Variables ---
# One histogram per numeric column (car_ID excluded via numeric_cols above).
cat("\n==== Step 5: Visualize Numeric Variables ====\n")
##
## ==== Step 5: Visualize Numeric Variables ====
# Draw a histogram for one numeric column of a data frame.
#
# @param data A data.frame containing the column `variable`.
# @param variable Character scalar: name of the numeric column to plot.
# @return Invisibly, the ggplot object that was printed.
plot_numeric_variable <- function(data, variable) {
  # sprintf instead of glue::glue(): glue trims leading/trailing newlines from
  # its template, so the original message lost its trailing "\n".
  cat(sprintf("\nDisplaying histogram for numeric variable: %s\n", variable))
  # Histogram. aes(.data[[variable]]) replaces aes_string(), which has been
  # deprecated since ggplot2 3.0.0 (see the warning in the knitted output).
  p <- ggplot(data, aes(x = .data[[variable]])) +
    geom_histogram(fill = "lightblue", color = "black", bins = 30) +
    labs(title = paste("Histogram for", variable), x = variable, y = "Frequency") +
    theme_minimal()
  print(p)
  invisible(p)
}
# Draw one histogram per numeric column. invisible() suppresses auto-printing
# of the list returned by lapply(): without it, the list of ggplot objects was
# printed at top level, re-rendering every plot a second time (the `[[1]]`,
# `[[2]]`, ... entries in the knitted output) and repeating each warning.
if (length(numeric_cols) > 0) {
  invisible(lapply(numeric_cols, function(col) plot_numeric_variable(data, col)))
}
## Displaying histogram for numeric variable: symboling
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 30 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Displaying histogram for numeric variable: wheelbase
## Warning: Removed 33 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Displaying histogram for numeric variable: carlength
## Warning: Removed 30 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Displaying histogram for numeric variable: carwidth
## Warning: Removed 30 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Displaying histogram for numeric variable: carheight
## Warning: Removed 30 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Displaying histogram for numeric variable: curbweight
## Warning: Removed 30 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Displaying histogram for numeric variable: enginesize
## Warning: Removed 28 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Displaying histogram for numeric variable: boreratio
## Warning: Removed 32 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Displaying histogram for numeric variable: stroke
## Warning: Removed 30 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Displaying histogram for numeric variable: compressionratio
## Warning: Removed 27 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Displaying histogram for numeric variable: horsepower
## Warning: Removed 27 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Displaying histogram for numeric variable: peakrpm
## Warning: Removed 30 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Displaying histogram for numeric variable: citympg
## Warning: Removed 30 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Displaying histogram for numeric variable: highwaympg
## Warning: Removed 30 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Displaying histogram for numeric variable: price
## Warning: Removed 27 rows containing non-finite outside the scale range
## (`stat_bin()`).

## [[1]]
## Warning: Removed 30 rows containing non-finite outside the scale range
## (`stat_bin()`).

##
## [[2]]
## Warning: Removed 33 rows containing non-finite outside the scale range
## (`stat_bin()`).

##
## [[3]]
## Warning: Removed 30 rows containing non-finite outside the scale range
## (`stat_bin()`).

##
## [[4]]
## Warning: Removed 30 rows containing non-finite outside the scale range
## (`stat_bin()`).

##
## [[5]]
## Warning: Removed 30 rows containing non-finite outside the scale range
## (`stat_bin()`).

##
## [[6]]
## Warning: Removed 30 rows containing non-finite outside the scale range
## (`stat_bin()`).

##
## [[7]]
## Warning: Removed 28 rows containing non-finite outside the scale range
## (`stat_bin()`).

##
## [[8]]
## Warning: Removed 32 rows containing non-finite outside the scale range
## (`stat_bin()`).

##
## [[9]]
## Warning: Removed 30 rows containing non-finite outside the scale range
## (`stat_bin()`).

##
## [[10]]
## Warning: Removed 27 rows containing non-finite outside the scale range
## (`stat_bin()`).

##
## [[11]]
## Warning: Removed 27 rows containing non-finite outside the scale range
## (`stat_bin()`).

##
## [[12]]
## Warning: Removed 30 rows containing non-finite outside the scale range
## (`stat_bin()`).

##
## [[13]]
## Warning: Removed 30 rows containing non-finite outside the scale range
## (`stat_bin()`).

##
## [[14]]
## Warning: Removed 30 rows containing non-finite outside the scale range
## (`stat_bin()`).

##
## [[15]]
## Warning: Removed 27 rows containing non-finite outside the scale range
## (`stat_bin()`).

# --- Step 6: Visualize Categorical Variables ---
# One bar plot per factor column (car_ID excluded via categorical_cols above).
cat("\n==== Step 6: Visualize Categorical Variables ====\n")
##
## ==== Step 6: Visualize Categorical Variables ====
# Draw a bar plot for one categorical column of a data frame.
#
# @param data A data.frame containing the column `variable`.
# @param variable Character scalar: name of the factor/character column to plot.
# @return Invisibly, the ggplot object that was printed.
plot_categorical_variable <- function(data, variable) {
  # sprintf instead of glue::glue(): glue trims trailing newlines from the
  # template, so the original message lost its trailing "\n".
  cat(sprintf("\nDisplaying barplot for categorical variable: %s\n", variable))
  # aes(.data[[variable]]) replaces aes_string(), deprecated since ggplot2 3.0.0.
  p <- ggplot(data, aes(x = .data[[variable]])) +
    geom_bar(fill = "skyblue", color = "darkblue") +
    labs(title = paste("Barplot for", variable), x = variable, y = "Count") +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) # Slanted labels for long level names
  print(p)
  invisible(p)
}
# Draw one bar plot per categorical column. invisible() suppresses
# auto-printing of the list returned by lapply(), which previously re-rendered
# every plot (the `[[1]]`...`[[9]]` entries in the knitted output).
if (length(categorical_cols) > 0) {
  invisible(lapply(categorical_cols, function(col) plot_categorical_variable(data, col)))
}
## Displaying barplot for categorical variable: fueltype

## Displaying barplot for categorical variable: aspiration

## Displaying barplot for categorical variable: doornumber

## Displaying barplot for categorical variable: carbody

## Displaying barplot for categorical variable: drivewheel

## Displaying barplot for categorical variable: enginelocation

## Displaying barplot for categorical variable: enginetype

## Displaying barplot for categorical variable: cylindernumber

## Displaying barplot for categorical variable: fuelsystem

## [[1]]

##
## [[2]]

##
## [[3]]

##
## [[4]]

##
## [[5]]

##
## [[6]]

##
## [[7]]

##
## [[8]]

##
## [[9]]

# --- Reflection on Dataset Profiling ---
cat("\n==== Reflection on Dataset Profiling ====\n")
##
## ==== Reflection on Dataset Profiling ====
# glue() interpolates the {...} placeholders from the current environment
# (dim(data), data_size) into the report text.
cat(glue("
1. **Dataset Size and Variables**:
- Rows: {dim(data)[1]}, Columns: {dim(data)[2]}.
- Approximate memory size: {format(data_size, units = 'auto')}.
2. **Variable Visualizations**:
- Histograms displayed for numeric variables (excluding car_ID).
- Bar plots displayed for categorical variables (excluding car_ID).
Profiling helps identify preprocessing needs, such as outlier handling, scaling, and encoding categorical features.
"))
## 1. **Dataset Size and Variables**:
## - Rows: 215, Columns: 26.
## - Approximate memory size: 59.4 Kb.
##
## 2. **Variable Visualizations**:
## - Histograms displayed for numeric variables (excluding car_ID).
## - Bar plots displayed for categorical variables (excluding car_ID).
##
## Profiling helps identify preprocessing needs, such as outlier handling, scaling, and encoding categorical features.
# === Data Preparation ===
# --- Step 1: Missing Values ---
# --- Sub-step 1.1: Identify Missing Values ---
cat("\n---- Sub-step 1.1: Identifying Missing Values ----\n")
##
## ---- Sub-step 1.1: Identifying Missing Values ----
# Calculate the number of missing values in each column
# (named numeric vector with one NA count per column, zeros included)
missing_values <- colSums(is.na(data))
cat("Missing values in each column:\n")
## Missing values in each column:
print(missing_values)
## car_ID symboling CarName fueltype
## 30 30 0 0
## aspiration doornumber carbody drivewheel
## 0 0 0 0
## enginelocation wheelbase carlength carwidth
## 0 33 30 30
## carheight curbweight enginetype cylindernumber
## 30 30 0 0
## enginesize fuelsystem boreratio stroke
## 28 0 32 30
## compressionratio horsepower peakrpm citympg
## 27 27 30 30
## highwaympg price
## 30 27
# Visualize Missing Values Before Handling
cat("\nVisualizing missing values (before handling)...\n")
##
## Visualizing missing values (before handling)...
missing_df <- data.frame(Column = names(missing_values), MissingCount = missing_values)
missing_df <- missing_df[missing_df$MissingCount > 0, ] # Filter columns with missing values
if (nrow(missing_df) > 0) {
# Bar plot for missing values before handling
# (columns ordered by descending missing count via reorder())
missing_plot_before <- ggplot(missing_df, aes(x = reorder(Column, -MissingCount), y = MissingCount)) +
geom_bar(stat = "identity", fill = "skyblue", color = "darkblue") +
labs(
title = "Missing Values Per Column (Before Handling)",
x = "Column",
y = "Number of Missing Values"
) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(missing_plot_before)
} else {
cat("No missing values found in the dataset.\n")
}

# --- Sub-step 1.2: Handle Missing Values in car_ID ---
cat("\n---- Sub-step 1.2: Handling Missing Values in car_ID ----\n")
##
## ---- Sub-step 1.2: Handling Missing Values in car_ID ----
# car_ID uniquely identifies a record, so a row without it cannot be imputed;
# such rows are dropped outright.
if (!anyNA(data$car_ID)) {
  cat("No missing values found in car_ID.\n")
} else {
  cat("Handling missing values in car_ID: Removing rows with missing car_ID values.\n")
  keep_rows <- !is.na(data$car_ID)
  data <- data[keep_rows, ] # Keep only rows whose identifier is present
  cat("Rows with missing car_ID values removed.\n")
}
## Handling missing values in car_ID: Removing rows with missing car_ID values.
## Rows with missing car_ID values removed.
# --- Sub-step 1.3: Handle Numeric Missing Values (Median Imputation) ---
cat("\n---- Sub-step 1.3: Handling Numeric Missing Values ----\n")
##
## ---- Sub-step 1.3: Handling Numeric Missing Values ----
# Identify numeric columns (vapply guarantees one logical per column)
numeric_cols <- names(data)[vapply(data, is.numeric, logical(1))]
# Replace NAs in every numeric column with that column's median, which is
# robust to the extreme values observed during profiling.
for (col in numeric_cols) {
  if (anyNA(data[[col]])) {
    col_median <- median(data[[col]], na.rm = TRUE) # Median of observed values only
    data[[col]][is.na(data[[col]])] <- col_median
    cat(sprintf("Filled missing values in numeric column '%s' with median: %.2f\n", col, col_median))
  }
}
## Filled missing values in numeric column 'wheelbase' with median: 96.95
## Filled missing values in numeric column 'enginesize' with median: 110.00
## Filled missing values in numeric column 'boreratio' with median: 3.31
# --- Sub-step 1.4: Handle Categorical Missing Values (Mode Imputation) ---
cat("\n---- Sub-step 1.4: Handling Categorical Missing Values ----\n")
##
## ---- Sub-step 1.4: Handling Categorical Missing Values ----
# Identify categorical columns (character or factor)
categorical_cols <- names(data)[vapply(data, function(x) is.character(x) || is.factor(x), logical(1))]
# Replace NAs in each categorical column with its most frequent observed level.
# NOTE(review): several categorical columns appear to use "" (empty string) as
# a placeholder level (see the blank rows in the Step 4 summaries); those are
# not NA and are left untouched here -- confirm whether that is intended.
for (col in categorical_cols) {
  if (anyNA(data[[col]])) {
    mode_val <- names(which.max(table(data[[col]], useNA = "no"))) # Most frequent level (ties -> first)
    data[[col]][is.na(data[[col]])] <- mode_val
    cat(sprintf("Filled missing values in categorical column '%s' with mode: '%s'\n", col, mode_val))
  }
}
# --- Sub-step 1.5: Verify and Visualize Post-Imputation ---
cat("\n---- Sub-step 1.5: Verifying Post-Imputation ----\n")
##
## ---- Sub-step 1.5: Verifying Post-Imputation ----
# Recalculate the number of missing values in each column
# (every count should now be zero after sub-steps 1.2-1.4)
final_missing_values <- colSums(is.na(data))
cat("Remaining missing values in each column (should be 0):\n")
## Remaining missing values in each column (should be 0):
print(final_missing_values)
## car_ID symboling CarName fueltype
## 0 0 0 0
## aspiration doornumber carbody drivewheel
## 0 0 0 0
## enginelocation wheelbase carlength carwidth
## 0 0 0 0
## carheight curbweight enginetype cylindernumber
## 0 0 0 0
## enginesize fuelsystem boreratio stroke
## 0 0 0 0
## compressionratio horsepower peakrpm citympg
## 0 0 0 0
## highwaympg price
## 0 0
# Visualize Missing Values After Handling
cat("\nVisualizing missing values (after handling)...\n")
##
## Visualizing missing values (after handling)...
final_missing_df <- data.frame(Column = names(final_missing_values), MissingCount = final_missing_values)
final_missing_df <- final_missing_df[final_missing_df$MissingCount > 0, ]
# With every NA imputed, the else-branch below should print the success
# message instead of plotting (confirmed by the knitted output).
if (nrow(final_missing_df) > 0) {
# Bar plot for missing values after handling
missing_plot_after <- ggplot(final_missing_df, aes(x = reorder(Column, -MissingCount), y = MissingCount)) +
geom_bar(stat = "identity", fill = "lightgreen", color = "darkgreen") +
labs(
title = "Missing Values Per Column (After Handling)",
x = "Column",
y = "Number of Missing Values"
) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(missing_plot_after)
} else {
cat("All missing values have been successfully handled.\n")
}
## All missing values have been successfully handled.
# --- Reflection on Step 1: Missing Values ---
cat("\n==== Reflection on Step 1: Missing Values ====\n")
##
## ==== Reflection on Step 1: Missing Values ====
# Narrative summary of the missing-value workflow (sub-steps 1.1-1.5).
cat(
"1. **Identification**:\n",
"- Missing values were identified and visualized, allowing us to assess the extent of the issue.\n\n",
"2. **Imputation Strategies**:\n",
"- `car_ID`: Rows with missing values were removed as it is a unique identifier.\n",
"- Numeric columns: Median imputation was used to ensure robustness to outliers.\n",
"- Categorical columns: Mode imputation preserved the most common category.\n\n",
"3. **Post-Imputation Analysis**:\n",
"- All columns were rechecked to ensure no residual missing values.\n",
"- Visualizations confirmed the success of the missing value handling process.\n\n",
"By addressing missing values systematically, the dataset is now complete and ready for further preprocessing or modeling.\n"
)
## 1. **Identification**:
## - Missing values were identified and visualized, allowing us to assess the extent of the issue.
##
## 2. **Imputation Strategies**:
## - `car_ID`: Rows with missing values were removed as it is a unique identifier.
## - Numeric columns: Median imputation was used to ensure robustness to outliers.
## - Categorical columns: Mode imputation preserved the most common category.
##
## 3. **Post-Imputation Analysis**:
## - All columns were rechecked to ensure no residual missing values.
## - Visualizations confirmed the success of the missing value handling process.
##
## By addressing missing values systematically, the dataset is now complete and ready for further preprocessing or modeling.
# --- Step 2: Handle Duplicates ---
cat("\n==== Step 2: Handle Duplicates ====\n")
##
## ==== Step 2: Handle Duplicates ====
# Deduplicate once and reuse the result: the original called
# dplyr::distinct(data) twice (once to count, once to assign).
deduplicated <- dplyr::distinct(data)
num_duplicates <- nrow(data) - nrow(deduplicated)
cat(sprintf("Number of duplicate rows: %d\n", num_duplicates))
## Number of duplicate rows: 0
# Remove duplicates
data <- deduplicated
cat(sprintf("Number of rows after removing duplicates: %d\n", nrow(data)))
## Number of rows after removing duplicates: 185
# --- Reflection on Step 2: Handling Duplicates ---
cat("\n==== Reflection on Step 2: Handling Duplicates ====\n")
##
## ==== Reflection on Step 2: Handling Duplicates ====
# Narrative summary; cat()'s default single-space separator explains the
# "0 ." spacing visible in the knitted output below.
cat(
"1. **Initial Check**:\n",
"- Identified and reported the number of duplicate rows: ", num_duplicates, ".\n\n",
"2. **Handling**:\n",
"- Removed all duplicate rows using `dplyr::distinct()`.\n",
"- Remaining rows after duplicate removal: ", nrow(data), ".\n\n",
"By addressing duplicates, the dataset is now free of redundant rows, ensuring consistency and accuracy for subsequent preprocessing.\n"
)
## 1. **Initial Check**:
## - Identified and reported the number of duplicate rows: 0 .
##
## 2. **Handling**:
## - Removed all duplicate rows using `dplyr::distinct()`.
## - Remaining rows after duplicate removal: 185 .
##
## By addressing duplicates, the dataset is now free of redundant rows, ensuring consistency and accuracy for subsequent preprocessing.
# --- Step 3: Handle Outliers ---
cat("\n==== Step 3: Handle Outliers ====\n")
##
## ==== Step 3: Handle Outliers ====
# --- Sub-step 3.1: Identify Outliers ---
# Outliers are flagged with Tukey fences: Q1 - 1.5*IQR and Q3 + 1.5*IQR,
# implemented by identify_outliers() below.
cat("\n---- Sub-step 3.1: Identifying Outliers in Numerical Columns ----\n")
##
## ---- Sub-step 3.1: Identifying Outliers in Numerical Columns ----
# Function to calculate IQR bounds
# Compute Tukey fences for a numeric vector: values outside
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are treated as outliers by the caller.
#
# @param column Numeric vector; NAs are ignored.
# @return A list with elements `lower` and `upper`.
identify_outliers <- function(column) {
  quartiles <- quantile(column, probs = c(0.25, 0.75), na.rm = TRUE)
  spread <- quartiles[[2]] - quartiles[[1]] # interquartile range
  list(
    lower = quartiles[1] - 1.5 * spread,
    upper = quartiles[2] + 1.5 * spread
  )
}
# Identify numeric columns
# (factor columns such as car_ID are excluded automatically by is.numeric)
numeric_cols <- names(data)[sapply(data, is.numeric)]
# Store original data for comparison
# (snapshot consumed by the before/after boxplots in Sub-step 3.3)
data_original <- data
# --- Sub-step 3.2: Handle Outliers Using Capping ---
cat("\n---- Sub-step 3.2: Handling Outliers Using Capping ----\n")
##
## ---- Sub-step 3.2: Handling Outliers Using Capping ----
# Cap (winsorize) every numeric column at its Tukey fences: values below the
# lower fence are raised to it, values above the upper fence are lowered to it.
# NOTE(review): price's lower fence (-6092.50, see output below) still admits
# the negative prices seen in profiling (min -1000), so invalid negative
# prices survive this step -- confirm whether they should be handled earlier.
for (column in numeric_cols) {
  limits <- identify_outliers(data[[column]])
  flagged <- which(data[[column]] < limits$lower | data[[column]] > limits$upper)
  if (length(flagged) == 0) {
    cat(sprintf("Column: %-15s | Outliers: None\n", column))
  } else {
    # Clamp only the flagged entries into [lower, upper]
    data[[column]][flagged] <- pmin(pmax(data[[column]][flagged], limits$lower), limits$upper)
    cat(sprintf("Column: %-15s | Outliers Adjusted: %-4d | Bounds: [%.2f, %.2f]\n",
                column, length(flagged), limits$lower, limits$upper))
  }
}
## Column: symboling | Outliers: None
## Column: wheelbase | Outliers Adjusted: 2 | Bounds: [82.65, 114.25]
## Column: carlength | Outliers: None
## Column: carwidth | Outliers Adjusted: 7 | Bounds: [59.65, 71.25]
## Column: carheight | Outliers: None
## Column: curbweight | Outliers: None
## Column: enginesize | Outliers Adjusted: 15 | Bounds: [25.00, 217.00]
## Column: boreratio | Outliers: None
## Column: stroke | Outliers Adjusted: 18 | Bounds: [2.66, 3.86]
## Column: compressionratio | Outliers Adjusted: 30 | Bounds: [7.40, 10.60]
## Column: horsepower | Outliers Adjusted: 14 | Bounds: [1.00, 185.00]
## Column: peakrpm | Outliers Adjusted: 2 | Bounds: [3750.00, 6550.00]
## Column: citympg | Outliers Adjusted: 1 | Bounds: [2.50, 46.50]
## Column: highwaympg | Outliers Adjusted: 2 | Bounds: [11.50, 47.50]
## Column: price | Outliers Adjusted: 13 | Bounds: [-6092.50, 30055.50]
# --- Sub-step 3.3: Compare Before and After Handling ---
cat("\n---- Sub-step 3.3: Comparing Before and After Outlier Handling ----\n")
##
## ---- Sub-step 3.3: Comparing Before and After Outlier Handling ----
# Side-by-side boxplots per numeric column: the pre-capping snapshot
# (data_original) next to the capped data, sharing one y axis.
for (col in numeric_cols) {
plot <- ggplot() +
geom_boxplot(data = data_original, aes(x = "Original Data", y = .data[[col]]),
outlier.colour = "red", fill = "lightblue", color = "darkblue") +
geom_boxplot(data = data, aes(x = "After Capping", y = .data[[col]]),
outlier.colour = "red", fill = "lightgreen", color = "darkblue") +
labs(title = paste("Comparison of", col, "- Before and After Outlier Handling"), x = "", y = col) +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 14)) +
scale_y_continuous(labels = scales::comma)
print(plot)
}















# --- Why Use Capping for Predicting Car Prices? ---
# Rationale for winsorizing rather than dropping outlier rows.
cat("\n---- Why Use Capping for Predicting Car Prices? ----\n")
##
## ---- Why Use Capping for Predicting Car Prices? ----
cat(
"Capping reduces the impact of outliers while retaining rare but valid cases (e.g., luxury cars),\n",
"ensuring the model learns from all data without losing critical information.\n"
)
## Capping reduces the impact of outliers while retaining rare but valid cases (e.g., luxury cars),
## ensuring the model learns from all data without losing critical information.
# --- Reflection on Step 3: Handling Outliers ---
cat("\n==== Reflection on Step 3: Handling Outliers ====\n")
##
## ==== Reflection on Step 3: Handling Outliers ====
cat(
"1. Outliers were detected using the IQR method and visualized with boxplots.\n",
"2. Capping limited extreme values' influence while preserving data integrity.\n",
"3. Boxplots showed the effectiveness of capping in reducing outlier impact.\n",
"This ensures the dataset is robust and ready for analysis.\n"
)
## 1. Outliers were detected using the IQR method and visualized with boxplots.
## 2. Capping limited extreme values' influence while preserving data integrity.
## 3. Boxplots showed the effectiveness of capping in reducing outlier impact.
## This ensures the dataset is robust and ready for analysis.
# --- Step 4: Scaling / Normalizing Features ---
#
# Scaling involves normalizing numeric features to a consistent range, often between 0 and 1.
# This step is critical for machine learning algorithms sensitive to feature magnitude, such as:
# - Gradient Descent-Based Models (e.g., Linear or Logistic Regression)
# - K-Nearest Neighbors (KNN)
# - Support Vector Machines (SVM)
# - Neural Networks
#
# Min-Max Scaling transforms each feature using the formula:
# Scaled Value = (Value - Min) / (Max - Min)
# This ensures that all features contribute equally during model training.
#
#
# # Identify numeric columns
# numeric_cols <- names(data)[sapply(data, is.numeric)]
#
# # Apply Min-Max Scaling to Numeric Columns
# cat("\n---- Scaling Numeric Columns ----\n")
# scaled_data <- data # Create a copy of the dataset for scaling
# scaled_data[numeric_cols] <- lapply(data[numeric_cols], function(col) {
# scaled_col <- (col - min(col, na.rm = TRUE)) / (max(col, na.rm = TRUE) - min(col, na.rm = TRUE))
# return(scaled_col)
# })
# cat("Features scaled using Min-Max scaling.\n")
#
# # Optional: Visualize Original vs. Scaled Data
# cat("\n---- Visualizing Original vs. Scaled Data (Optional) ----\n")
# for (col in numeric_cols) {
# plot <- ggplot() +
# # Original data distribution
# geom_density(data = data, aes_string(x = col), fill = "lightblue", alpha = 0.5, color = "darkblue") +
# # Scaled data distribution
# geom_density(data = scaled_data, aes_string(x = col), fill = "lightgreen", alpha = 0.5, color = "darkgreen") +
# labs(
# title = paste("Comparison of", col, "- Original vs. Scaled"),
# x = col,
# y = "Density"
# ) +
# theme_minimal() +
# theme(
# plot.title = element_text(hjust = 0.5, face = "bold", size = 14)
# )
# print(plot)
# }
# --- Reflection on Step 4: Scaling / Normalizing Features ---
cat("\n==== Reflection on Step 4: Scaling / Normalizing Features ====\n")
##
## ==== Reflection on Step 4: Scaling / Normalizing Features ====
# Scaling is deliberately skipped (the Min-Max implementation is kept
# commented out above for reference); this cat() records the rationale.
cat(
"1. Scaling was skipped because the selected machine learning algorithms (e.g., Random Forest, Gradient Boosting)\n",
" are tree-based models that do not rely on feature scaling.\n",
"2. These models split data based on thresholds, making scaling unnecessary.\n",
"3. Skipping scaling ensures computational efficiency without compromising model performance.\n",
"4. If future models require distance-based methods (e.g., KNN or SVM), scaling can be revisited.\n"
)
## 1. Scaling was skipped because the selected machine learning algorithms (e.g., Random Forest, Gradient Boosting)
## are tree-based models that do not rely on feature scaling.
## 2. These models split data based on thresholds, making scaling unnecessary.
## 3. Skipping scaling ensures computational efficiency without compromising model performance.
## 4. If future models require distance-based methods (e.g., KNN or SVM), scaling can be revisited.
# --- Step 5: Data Transformation ---
cat("\n==== Step 5: Data Transformation ====\n")
##
## ==== Step 5: Data Transformation ====
# Identify numeric columns
# (refreshed scan so the list reflects the current state of `data`)
numeric_cols <- names(data)[sapply(data, is.numeric)]
# Function to calculate skewness
library(moments) # Ensure the 'moments' package is installed
calculate_skewness <- function(column) {
skewness(column, na.rm = TRUE)
}
# Apply Log Transformation to Highly Skewed Numeric Columns
cat("\n---- Applying Log Transformation to Highly Skewed Columns ----\n")
##
## ---- Applying Log Transformation to Highly Skewed Columns ----
# A column is transformed only when it is strongly right-skewed (skewness > 1)
# and strictly positive, because log() is undefined for values <= 0.
for (col in numeric_cols) {
  skew_val <- calculate_skewness(data[[col]])
  cat(sprintf("Skewness of '%s': %.2f\n", col, skew_val))
  # && short-circuits, so min() is only evaluated for columns past the
  # skewness threshold (same evaluation order as the original condition)
  needs_log <- skew_val > 1 && min(data[[col]], na.rm = TRUE) > 0
  if (needs_log) {
    data[[paste0(col, "_log")]] <- log(data[[col]])
    cat(sprintf("Applied log transformation to '%s' (Skewness: %.2f).\n", col, skew_val))
  } else {
    cat(sprintf("Skipped log transformation for '%s' (Skewness: %.2f).\n", col, skew_val))
  }
}
## Skewness of 'symboling': 0.15
## Skipped log transformation for 'symboling' (Skewness: 0.15).
## Skewness of 'wheelbase': 0.91
## Skipped log transformation for 'wheelbase' (Skewness: 0.91).
## Skewness of 'carlength': 0.16
## Skipped log transformation for 'carlength' (Skewness: 0.16).
## Skewness of 'carwidth': 0.88
## Skipped log transformation for 'carwidth' (Skewness: 0.88).
## Skewness of 'carheight': 0.06
## Skipped log transformation for 'carheight' (Skewness: 0.06).
## Skewness of 'curbweight': 0.66
## Skipped log transformation for 'curbweight' (Skewness: 0.66).
## Skewness of 'enginesize': 0.31
## Skipped log transformation for 'enginesize' (Skewness: 0.31).
## Skewness of 'boreratio': 0.02
## Skipped log transformation for 'boreratio' (Skewness: 0.02).
## Skewness of 'stroke': -0.38
## Skipped log transformation for 'stroke' (Skewness: -0.38).
## Skewness of 'compressionratio': 0.03
## Skipped log transformation for 'compressionratio' (Skewness: 0.03).
## Skewness of 'horsepower': 0.15
## Skipped log transformation for 'horsepower' (Skewness: 0.15).
## Skewness of 'peakrpm': 0.03
## Skipped log transformation for 'peakrpm' (Skewness: 0.03).
## Skewness of 'citympg': 0.54
## Skipped log transformation for 'citympg' (Skewness: 0.54).
## Skewness of 'highwaympg': 0.31
## Skipped log transformation for 'highwaympg' (Skewness: 0.31).
## Skewness of 'price': 0.92
## Skipped log transformation for 'price' (Skewness: 0.92).
# --- Visualize Original vs. Log Transformed Data ---
cat("\n---- Visualizing Original vs. Log Transformed Data ----\n")
##
## ---- Visualizing Original vs. Log Transformed Data ----
# Overlay the original (blue) and log-transformed (green) densities for every
# column that received a log transform in the previous step.
# FIX: aes_string() is deprecated since ggplot2 3.0; the .data pronoun is the
# supported way to map a column named by a character string.
for (col in numeric_cols) {
  log_col <- paste0(col, "_log")
  if (log_col %in% names(data)) { # Only plot columns that were transformed
    plot <- ggplot(data) +
      geom_density(aes(x = .data[[col]]), fill = "lightblue", alpha = 0.5, color = "darkblue") +
      geom_density(aes(x = .data[[log_col]]), fill = "lightgreen", alpha = 0.5, color = "darkgreen") +
      labs(
        title = paste("Comparison of", col, "- Original vs. Log Transformed"),
        x = col,
        y = "Density"
      ) +
      theme_minimal() +
      theme(
        plot.title = element_text(hjust = 0.5, face = "bold", size = 14)
      )
    print(plot)
  }
}
# --- Reflection on Step 5: Data Transformation ---
cat("\n==== Reflection on Step 5: Data Transformation ====\n")
##
## ==== Reflection on Step 5: Data Transformation ====
# Narrative summary of the transformation step, echoed to the console log.
# NOTE(review): item 4 claims visualization confirmed the transformation, but
# the skewness log above shows every column was below the > 1 threshold, so no
# log columns (and therefore no comparison plots) were actually produced on
# this dataset -- confirm the wording still matches the data.
cat(
"1. Log transformation was applied to numeric columns with high skewness (Skewness > 1) and positive values.\n",
"2. This reduces skewness, stabilizes variance, and improves linear relationships for better linear regression performance.\n",
"3. Columns with low skewness or non-positive values were skipped to avoid unnecessary transformations or errors.\n",
"4. Visualization confirmed the effectiveness of log transformation in normalizing distributions.\n"
)
## 1. Log transformation was applied to numeric columns with high skewness (Skewness > 1) and positive values.
## 2. This reduces skewness, stabilizes variance, and improves linear relationships for better linear regression performance.
## 3. Columns with low skewness or non-positive values were skipped to avoid unnecessary transformations or errors.
## 4. Visualization confirmed the effectiveness of log transformation in normalizing distributions.
# === Step 6: Feature Engineering ===
cat("\n==== Step 6: Feature Engineering ====\n")
##
## ==== Step 6: Feature Engineering ====
# --- Sub-step 6.1: Review Existing Features ---
cat("\n---- Reviewing Existing Features ----\n")
##
## ---- Reviewing Existing Features ----
# Display column names and summary statistics so the feature-engineering
# decisions below can be checked against the current state of `data`.
cat("Current column names:\n")
## Current column names:
print(names(data))
## [1] "car_ID" "symboling" "CarName" "fueltype"
## [5] "aspiration" "doornumber" "carbody" "drivewheel"
## [9] "enginelocation" "wheelbase" "carlength" "carwidth"
## [13] "carheight" "curbweight" "enginetype" "cylindernumber"
## [17] "enginesize" "fuelsystem" "boreratio" "stroke"
## [21] "compressionratio" "horsepower" "peakrpm" "citympg"
## [25] "highwaympg" "price"
# summary() gives per-column distributions; note the factor columns still
# contain empty-string / "unknown" levels that Step 7 filters out.
cat("\nSummary statistics:\n")
##
## Summary statistics:
print(summary(data))
## car_ID symboling CarName fueltype aspiration
## 1 : 1 Min. :-2.0000 Length:185 : 0 : 0
## 2 : 1 1st Qu.: 0.0000 Class :character diesel : 16 std :151
## 3 : 1 Median : 1.0000 Mode :character gas :161 turbo: 34
## 4 : 1 Mean : 0.8595 unknown: 8
## 5 : 1 3rd Qu.: 2.0000
## 6 : 1 Max. : 3.0000
## (Other):179
## doornumber carbody drivewheel enginelocation wheelbase
## : 0 : 0 : 0 : 0 Min. : 86.60
## four:102 convertible: 6 4wd: 8 front:182 1st Qu.: 94.50
## two : 83 flyingcar : 8 fwd:106 rear : 3 Median : 96.95
## hardtop : 7 rwd: 71 Mean : 98.73
## hatchback :59 3rd Qu.:102.40
## sedan :82 Max. :114.25
## wagon :23
## carlength carwidth carheight curbweight enginetype
## Min. :144.6 Min. :61.80 Min. :47.8 Min. :1713 ohc :132
## 1st Qu.:166.3 1st Qu.:64.00 1st Qu.:51.6 1st Qu.:2145 ohcf : 13
## Median :173.2 Median :65.50 Median :54.1 Median :2420 dohc : 12
## Mean :174.0 Mean :65.90 Mean :53.7 Mean :2556 ohcv : 12
## 3rd Qu.:183.5 3rd Qu.:66.90 3rd Qu.:55.5 3rd Qu.:2952 l : 11
## Max. :202.6 Max. :71.25 Max. :59.8 Max. :4066 rotor : 4
## (Other): 1
## cylindernumber enginesize fuelsystem boreratio stroke
## : 0 Min. : 25.0 mpfi :85 Min. :2.540 Min. :2.660
## eight : 4 1st Qu.: 97.0 2bbl :59 1st Qu.:3.150 1st Qu.:3.110
## five : 8 Median :110.0 idi :17 Median :3.310 Median :3.270
## four :145 Mean :121.1 1bbl :11 Mean :3.325 Mean :3.262
## six : 23 3rd Qu.:145.0 spdi : 8 3rd Qu.:3.580 3rd Qu.:3.410
## twelve: 1 Max. :217.0 4bbl : 3 Max. :3.940 Max. :3.860
## two : 4 (Other): 2
## compressionratio horsepower peakrpm citympg
## Min. : 7.400 Min. : 1.00 Min. :4150 Min. :13.00
## 1st Qu.: 8.600 1st Qu.: 70.00 1st Qu.:4800 1st Qu.:19.00
## Median : 9.000 Median : 95.00 Median :5200 Median :24.00
## Mean : 9.114 Mean : 99.11 Mean :5142 Mean :25.14
## 3rd Qu.: 9.400 3rd Qu.:116.00 3rd Qu.:5500 3rd Qu.:30.00
## Max. :10.600 Max. :185.00 Max. :6550 Max. :46.50
##
## highwaympg price
## Min. :16.00 Min. :-1000
## 1st Qu.:25.00 1st Qu.: 7463
## Median :30.00 Median : 9995
## Mean :30.58 Mean :12277
## 3rd Qu.:34.00 3rd Qu.:16500
## Max. :47.50 Max. :30056
##
# --- Sub-step 6.2: Feature Engineering ---
cat("\n---- Engineering Features ----\n")
##
## ---- Engineering Features ----
# 1. **Interaction Features: Horsepower-to-Weight Ratio**
cat("\nCalculating Horsepower-to-Weight Ratio...\n")
##
## Calculating Horsepower-to-Weight Ratio...
# Power-to-weight is a classic performance proxy: the same horsepower in a
# lighter car implies a quicker (and typically pricier) vehicle.
hp_weight_inputs <- c("horsepower", "curbweight")
if (all(hp_weight_inputs %in% names(data))) {
  data$hp_to_weight <- data$horsepower / data$curbweight
  cat("Horsepower-to-weight ratio created.\n")
} else {
  cat("Skipped horsepower-to-weight ratio due to missing 'horsepower' or 'curbweight'.\n")
}
## Horsepower-to-weight ratio created.
# 2. **Vehicle Segment Classification (Luxury, Standard, Economy)**
cat("\nClassifying Vehicle Segments...\n")
##
## Classifying Vehicle Segments...
# Price terciles define three market segments; cut() bins each car's price.
if ("price" %in% names(data)) {
  price_terciles <- quantile(data$price, probs = c(0.33, 0.66, 1.0), na.rm = TRUE)
  data$segment <- cut(
    data$price,
    breaks = c(-Inf, price_terciles), # -Inf guards the lower tail
    labels = c("Economy", "Standard", "Luxury"),
    include.lowest = TRUE
  )
  cat("Vehicle segments classified into Economy, Standard, and Luxury.\n")
} else {
  cat("Skipped vehicle segment classification due to missing 'price'.\n")
}
## Vehicle segments classified into Economy, Standard, and Luxury.
# 3. **Fuel Economy Score**
cat("\nCalculating Fuel Economy Score...\n")
##
## Calculating Fuel Economy Score...
# The mean of city and highway MPG gives one overall efficiency figure.
mpg_inputs <- c("citympg", "highwaympg")
if (all(mpg_inputs %in% names(data))) {
  # rowMeans of the two columns == (citympg + highwaympg) / 2
  data$fuel_economy <- rowMeans(data[, mpg_inputs])
  cat("Fuel economy score created.\n")
} else {
  cat("Skipped fuel economy score due to missing 'citympg' or 'highwaympg'.\n")
}
## Fuel economy score created.
# 4. **Brand Value**
cat("\nCreating Brand Value Feature...\n")
##
## Creating Brand Value Feature...
if ("CarName" %in% names(data)) {
  # The brand is the first whitespace-delimited token of CarName, lower-cased
  data$CarBrand <- tolower(gsub(" .*", "", data$CarName))
  # Standardize brand names (known misspellings in the raw data)
  brand_corrections <- c("maxda" = "mazda", "vw" = "volkswagen", "vokswagen" = "volkswagen",
                         "porcshce" = "porsche", "toyouta" = "toyota")
  data$CarBrand <- recode(data$CarBrand, !!!brand_corrections)
  # Attach each brand's mean price as a proxy for perceived brand value
  brand_avg_price <- data %>%
    group_by(CarBrand) %>%
    summarise(BrandAvgPrice = mean(price, na.rm = TRUE), .groups = "drop")
  data <- left_join(data, brand_avg_price, by = "CarBrand")
  cat("Brand value feature created.\n")
} else {
  cat("Skipped brand value due to missing 'CarName'.\n")
}
## Brand value feature created.
# --- Sub-step 6.3: Visualizing Features ---
cat("\n---- Visualizing New Features ----\n")
##
## ---- Visualizing New Features ----
library(ggplot2)
# 1. Visualize Horsepower-to-Weight Ratio
# BUG FIX: the original wrote `... + theme(...) %>% print()`. Because %>%
# binds tighter than +, only the theme object was piped to print() -- the
# console log shows a theme list instead of the figure, and the plot itself
# was never printed inside the if-block. Build the full plot, then print it.
if ("hp_to_weight" %in% names(data)) {
  hp_plot <- ggplot(data, aes(x = hp_to_weight, y = price)) +
    geom_point(color = "firebrick", alpha = 0.7) +
    labs(
      title = "Horsepower-to-Weight Ratio vs Price",
      x = "Horsepower-to-Weight Ratio",
      y = "Price"
    ) +
    theme_minimal() +
    theme(plot.title = element_text(color = "darkred", size = 14))
  print(hp_plot)
}
## List of 1
## $ plot.title:List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : chr "darkred"
## ..$ size : num 14
## ..$ hjust : NULL
## ..$ vjust : NULL
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : NULL
## ..$ debug : NULL
## ..$ inherit.blank: logi FALSE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## - attr(*, "class")= chr [1:2] "theme" "gg"
## - attr(*, "complete")= logi FALSE
## - attr(*, "validate")= logi TRUE

# 2. Visualize Vehicle Segment Distribution
# BUG FIX: as with the previous figure, `theme(...) %>% print()` printed the
# theme list instead of the plot (see the logged theme output). Also, the
# hard-coded fill vector inside geom_bar() silently overrode the
# aes(fill = segment) mapping; scale_fill_manual() applies the same three
# colours through the mapping instead, keeping the legend/levels consistent.
if ("segment" %in% names(data)) {
  segment_plot <- ggplot(data, aes(x = segment, fill = segment)) +
    geom_bar(color = "black") +
    scale_fill_manual(values = c("#66c2a5", "#fc8d62", "#8da0cb")) +
    labs(
      title = "Vehicle Segment Distribution",
      x = "Segment",
      y = "Count"
    ) +
    theme_minimal() +
    theme(
      plot.title = element_text(color = "darkblue", size = 14),
      legend.position = "none"
    )
  print(segment_plot)
}
## List of 2
## $ legend.position: chr "none"
## $ plot.title :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : chr "darkblue"
## ..$ size : num 14
## ..$ hjust : NULL
## ..$ vjust : NULL
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : NULL
## ..$ debug : NULL
## ..$ inherit.blank: logi FALSE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## - attr(*, "class")= chr [1:2] "theme" "gg"
## - attr(*, "complete")= logi FALSE
## - attr(*, "validate")= logi TRUE

# 3. Visualize Fuel Economy Score
# BUG FIX: same %>%-precedence bug as the earlier figures -- the pipe printed
# the theme object, not the plot. Assign the complete plot and print it.
if ("fuel_economy" %in% names(data)) {
  fuel_plot <- ggplot(data, aes(x = fuel_economy, y = price)) +
    geom_point(color = "darkgreen", alpha = 0.7) +
    labs(
      title = "Fuel Economy Score vs Price",
      x = "Fuel Economy Score",
      y = "Price"
    ) +
    theme_minimal() +
    theme(plot.title = element_text(color = "forestgreen", size = 14))
  print(fuel_plot)
}
## List of 1
## $ plot.title:List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : chr "forestgreen"
## ..$ size : num 14
## ..$ hjust : NULL
## ..$ vjust : NULL
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : NULL
## ..$ debug : NULL
## ..$ inherit.blank: logi FALSE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## - attr(*, "class")= chr [1:2] "theme" "gg"
## - attr(*, "complete")= logi FALSE
## - attr(*, "validate")= logi TRUE

# 4. Visualize Brand Value (Average Price by Brand)
# Robustness/consistency fix: the original relied on top-level auto-printing
# of the if-block's value, which silently stops working if this script is ever
# wrapped in a function or loop. Print explicitly, like the other figures.
# The condition also checks CarBrand, which the plot actually maps.
if (all(c("BrandAvgPrice", "CarBrand") %in% names(data))) {
  brand_plot <- ggplot(data, aes(x = reorder(CarBrand, price, median), y = price, fill = CarBrand)) +
    geom_boxplot(outlier.colour = "red", outlier.shape = 16, outlier.size = 2, alpha = 0.7) + # Outliers highlighted in red
    labs(
      title = "Price Distribution by Car Brand",
      x = "Car Brand",
      y = "Price"
    ) +
    theme_minimal() +
    theme(
      plot.title = element_text(color = "darkblue", size = 14),
      axis.text.x = element_text(angle = 45, hjust = 1)
    ) +
    scale_fill_manual(values = rainbow(length(unique(data$CarBrand)))) # One colour per brand
  print(brand_plot)
} else {
  cat("BrandAvgPrice or CarBrand is not available for visualization.\n")
}

# --- Reflection on Step 6: Feature Engineering ---
cat("\n==== Reflection on Step 6: Feature Engineering ====\n")
##
## ==== Reflection on Step 6: Feature Engineering ====
# Narrative recap of the four engineered features (hp_to_weight, segment,
# fuel_economy, BrandAvgPrice), echoed to the console log.
cat(
"1. Horsepower-to-Weight Ratio:\n",
" - Captures the relationship between engine power and vehicle weight.\n",
" - Indicates vehicle performance, relevant for pricing decisions.\n",
"2. Vehicle Segments:\n",
" - Classified cars into Economy, Standard, and Luxury based on price quantiles.\n",
" - Adds domain knowledge about market positioning of vehicles.\n",
"3. Fuel Economy Score:\n",
" - Combines city and highway MPG into a single efficiency metric.\n",
" - Provides insights into fuel consumption trends.\n",
"4. Brand Value:\n",
" - Created 'BrandAvgPrice' to reflect consumer perception and reliability for each brand.\n",
"These features ensure linear relationships, interpretability, and improved model performance for linear regression.\n"
)
## 1. Horsepower-to-Weight Ratio:
## - Captures the relationship between engine power and vehicle weight.
## - Indicates vehicle performance, relevant for pricing decisions.
## 2. Vehicle Segments:
## - Classified cars into Economy, Standard, and Luxury based on price quantiles.
## - Adds domain knowledge about market positioning of vehicles.
## 3. Fuel Economy Score:
## - Combines city and highway MPG into a single efficiency metric.
## - Provides insights into fuel consumption trends.
## 4. Brand Value:
## - Created 'BrandAvgPrice' to reflect consumer perception and reliability for each brand.
## These features ensure linear relationships, interpretability, and improved model performance for linear regression.
# === Step 7: Visualizing Features Against Price ===
cat("\n==== Step 7: Visualizing Features Against Price ====\n")
##
## ==== Step 7: Visualizing Features Against Price ====
library(ggplot2)
library(dplyr)
library(scales)
# --- Sub-step 7.1: Boxplots for Categorical Variables Against Price ---
cat("\n---- Boxplots for Categorical Variables Against Price ----\n")
##
## ---- Boxplots for Categorical Variables Against Price ----
# Identify categorical columns
categorical_cols <- names(data)[sapply(data, is.character)]
cat("Categorical Columns Identified:\n")
## Categorical Columns Identified:
print(categorical_cols)
## [1] "CarName" "CarBrand"
# Drop rows with a missing price or any NA / empty-string cell.
# FIX: apply(., 1, ...) coerces the whole data frame to a character matrix,
# discarding column types row by row; dplyr::if_any() evaluates each column in
# its native type and vectorises the same NA/empty check per row.
data_cleaned <- data %>%
  filter(!is.na(price)) %>%
  filter(!if_any(everything(), ~ is.na(.x) | .x == ""))
if (nrow(data_cleaned) == 0) {
  stop("Error: The dataset 'data_cleaned' is empty after removing null or empty values.")
}
# Generate boxplots for categorical variables.
# FIX: aes_string() is deprecated since ggplot2 3.0; columns named by a
# character string are mapped with the .data pronoun instead.
for (col in categorical_cols) {
  if (col == "CarName") {
    # 'CarName' has too many levels to plot them all: keep the 10 most frequent
    cat(sprintf("\nColumn '%s' has many categories. Visualizing top 10 most frequent.\n", col))
    # Get the top 10 most frequent CarNames
    top_cars <- data_cleaned %>%
      count(CarName) %>%
      arrange(desc(n)) %>%
      slice(1:10)
    # Filter data to include only top 10 CarNames
    filtered_data <- data_cleaned %>% filter(CarName %in% top_cars$CarName)
    # Boxplot ordered by median price within each name
    plot <- ggplot(filtered_data, aes(x = reorder(CarName, price, FUN = median), y = price, fill = CarName)) +
      geom_boxplot(outlier.colour = "gold", outlier.shape = 16, outlier.size = 3, alpha = 0.8) +
      labs(
        title = "Price Distribution by Top 10 Most Frequent Car Names",
        x = "Car Name",
        y = "Price"
      ) +
      theme_minimal() +
      theme(
        plot.title = element_text(hjust = 0.5, face = "bold", size = 16, color = "purple"),
        axis.text.x = element_text(angle = 45, hjust = 1, color = "darkblue"),
        axis.text.y = element_text(color = "darkgreen")
      ) +
      scale_y_continuous(labels = scales::comma) +
      scale_fill_manual(values = grDevices::rainbow(10)) # Palette sized for the 10 kept categories
    print(plot)
  } else {
    # Standard boxplots for the remaining categorical variables
    num_colors <- length(unique(data_cleaned[[col]]))
    plot <- ggplot(data_cleaned, aes(x = .data[[col]], y = price, fill = .data[[col]])) +
      geom_boxplot(outlier.colour = "gold", outlier.shape = 16, outlier.size = 3, alpha = 0.8) +
      labs(
        title = paste("Price Distribution by", col),
        x = col,
        y = "Price"
      ) +
      theme_minimal() +
      theme(
        plot.title = element_text(hjust = 0.5, face = "bold", size = 16, color = "purple"),
        axis.text.x = element_text(angle = 45, hjust = 1, color = "darkblue"),
        axis.text.y = element_text(color = "darkgreen")
      ) +
      scale_y_continuous(labels = scales::comma) +
      scale_fill_manual(values = grDevices::rainbow(num_colors)) # One colour per category level
    print(plot)
  }
}
##
## Column 'CarName' has many categories. Visualizing top 10 most frequent.


cat("\n---- Why Boxplots for Categorical Variables? ----\n")
##
## ---- Why Boxplots for Categorical Variables? ----
# Rationale note printed to the log; the string literal spans two source lines
# so its embedded newline is part of the intended output.
cat(
"Boxplots help visualize price distributions across categories, highlighting medians, variability, and outliers.
For 'CarName', only the top 10 most frequent categories are visualized for better interpretability.\n"
)
## Boxplots help visualize price distributions across categories, highlighting medians, variability, and outliers.
## For 'CarName', only the top 10 most frequent categories are visualized for better interpretability.
# --- Sub-step 7.2: Scatterplots for Numerical Variables Against Price ---
cat("\n---- Scatterplots for Numerical Variables Against Price ----\n")
##
## ---- Scatterplots for Numerical Variables Against Price ----
# Identify numerical columns (vapply makes the expected return type explicit)
numeric_cols <- names(data_cleaned)[vapply(data_cleaned, is.numeric, logical(1))]
cat("Numerical Columns Identified:\n")
## Numerical Columns Identified:
print(numeric_cols)
## [1] "symboling" "wheelbase" "carlength" "carwidth"
## [5] "carheight" "curbweight" "enginesize" "boreratio"
## [9] "stroke" "compressionratio" "horsepower" "peakrpm"
## [13] "citympg" "highwaympg" "price" "hp_to_weight"
## [17] "fuel_economy" "BrandAvgPrice"
# Generate scatterplots for numerical variables.
# FIX: aes_string() is deprecated -> use the .data pronoun. The CarName colour
# palette does not depend on the loop variable, so it is computed once outside
# the loop instead of being rebuilt for every plot.
num_colors <- length(unique(data_cleaned$CarName))
color_palette <- grDevices::rainbow(num_colors)
for (col in numeric_cols) {
  if (col != "price") { # Skip the target itself
    cat(sprintf("\nCreating scatterplot for '%s' vs 'price'.\n", col))
    plot <- ggplot(data_cleaned, aes(x = .data[[col]], y = price)) +
      geom_point(aes(color = CarName), size = 2.5, alpha = 0.6) + # Semi-transparent points
      geom_smooth(method = "lm", color = "red", se = FALSE, linewidth = 1) + # Linear trendline
      labs(
        title = paste("Relationship Between", col, "and Price"),
        x = col,
        y = "Price"
      ) +
      theme_minimal() +
      theme(
        plot.title = element_text(hjust = 0.5, face = "bold", size = 16, color = "darkred"),
        axis.text.x = element_text(size = 10, color = "darkblue"),
        axis.text.y = element_text(size = 10, color = "darkgreen"),
        legend.position = "none" # One entry per CarName would overwhelm the legend
      ) +
      scale_y_continuous(labels = scales::comma) +
      scale_x_continuous(labels = scales::comma) +
      scale_color_manual(values = color_palette) # Shared palette across all plots
    print(plot)
  }
}
##
## Creating scatterplot for 'symboling' vs 'price'.
## `geom_smooth()` using formula = 'y ~ x'

##
## Creating scatterplot for 'wheelbase' vs 'price'.
## `geom_smooth()` using formula = 'y ~ x'

##
## Creating scatterplot for 'carlength' vs 'price'.
## `geom_smooth()` using formula = 'y ~ x'

##
## Creating scatterplot for 'carwidth' vs 'price'.
## `geom_smooth()` using formula = 'y ~ x'

##
## Creating scatterplot for 'carheight' vs 'price'.
## `geom_smooth()` using formula = 'y ~ x'

##
## Creating scatterplot for 'curbweight' vs 'price'.
## `geom_smooth()` using formula = 'y ~ x'

##
## Creating scatterplot for 'enginesize' vs 'price'.
## `geom_smooth()` using formula = 'y ~ x'

##
## Creating scatterplot for 'boreratio' vs 'price'.
## `geom_smooth()` using formula = 'y ~ x'

##
## Creating scatterplot for 'stroke' vs 'price'.
## `geom_smooth()` using formula = 'y ~ x'

##
## Creating scatterplot for 'compressionratio' vs 'price'.
## `geom_smooth()` using formula = 'y ~ x'

##
## Creating scatterplot for 'horsepower' vs 'price'.
## `geom_smooth()` using formula = 'y ~ x'

##
## Creating scatterplot for 'peakrpm' vs 'price'.
## `geom_smooth()` using formula = 'y ~ x'

##
## Creating scatterplot for 'citympg' vs 'price'.
## `geom_smooth()` using formula = 'y ~ x'

##
## Creating scatterplot for 'highwaympg' vs 'price'.
## `geom_smooth()` using formula = 'y ~ x'

##
## Creating scatterplot for 'hp_to_weight' vs 'price'.
## `geom_smooth()` using formula = 'y ~ x'

##
## Creating scatterplot for 'fuel_economy' vs 'price'.
## `geom_smooth()` using formula = 'y ~ x'

##
## Creating scatterplot for 'BrandAvgPrice' vs 'price'.
## `geom_smooth()` using formula = 'y ~ x'

cat("\n---- Why Scatterplots for Numerical Variables? ----\n")
##
## ---- Why Scatterplots for Numerical Variables? ----
# Rationale note printed to the log; the string literal spans two source lines
# so its embedded newline is part of the intended output.
cat(
"Scatterplots reveal relationships, trends, and correlations between numerical variables and price.
Including a linear trendline helps identify significant predictors for car pricing.\n"
)
## Scatterplots reveal relationships, trends, and correlations between numerical variables and price.
## Including a linear trendline helps identify significant predictors for car pricing.
# === Step 8: Redundant Features and Feature Selection ===
cat("\n==== Step 8: Redundant Features and Feature Selection ====\n")
##
## ==== Step 8: Redundant Features and Feature Selection ====
library(caret) # For findCorrelation function
library(ggplot2) # For heatmap visualization
library(dplyr) # For data manipulation
library(reshape2) # For reshaping correlation matrix
# --- Sub-step 8.1: Check and Remove Redundant Features ---
cat("\n---- Checking and Removing Redundant Features ----\n")
##
## ---- Checking and Removing Redundant Features ----
# Identify and remove explicitly redundant columns.
# NOTE(review): the dataset's identifier column is spelled 'car_ID' (see the
# names printed in Step 6), so "Car_ID" never matches and this branch always
# logs "No specified redundant features found." Before correcting the case,
# confirm downstream code: Modeling Step 2 drops 'car_ID' via all_of(), which
# would error if the column had already been removed here.
redundant_cols <- c("Car_ID") # Replace with known redundant column names
if (all(redundant_cols %in% names(data))) {
data <- data %>% select(-all_of(redundant_cols))
cat("Removed redundant features:\n")
print(redundant_cols)
} else {
cat("No specified redundant features found.\n")
}
## No specified redundant features found.
# --- Sub-step 8.2: Feature Selection Based on Correlation Matrix ---
cat("\n---- Feature Selection Based on Correlation Matrix ----\n")
##
## ---- Feature Selection Based on Correlation Matrix ----
# Identify numeric columns
numeric_cols <- names(data)[sapply(data, is.numeric)]
cat("Numeric Columns Identified:\n")
## Numeric Columns Identified:
print(numeric_cols)
# Ensure there are numeric columns
if (length(numeric_cols) == 0) {
  stop("Error: No numeric columns found in the dataset.")
}
# Generate the correlation matrix for numeric columns (complete cases only)
correlation_matrix <- cor(data[numeric_cols], use = "complete.obs")
# Reshape the correlation matrix into long form for the heatmap.
# FIX: call reshape2::melt() explicitly. data.table is attached later on the
# search path, and its melt() generic only *redirects* matrices to reshape2
# with a deprecation warning that the warning text says will become an error.
correlation_melted <- reshape2::melt(correlation_matrix)
colnames(correlation_melted) <- c("Feature1", "Feature2", "Correlation")
# Render the correlation matrix as a tile heatmap (blue = -1, red = +1)
heatmap_plot <- ggplot(correlation_melted, aes(x = Feature1, y = Feature2, fill = Correlation)) +
  geom_tile(color = "white") + # White gridlines between tiles
  scale_fill_gradient2(
    low = "blue", mid = "white", high = "red", midpoint = 0,
    limit = c(-1, 1), name = "Correlation" # Full [-1, 1] scale with legend title
  ) +
  labs(
    title = "Correlation Matrix of Numeric Features",
    x = "",
    y = ""
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 10, color = "darkblue"),
    axis.text.y = element_text(size = 10, color = "darkblue"),
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5, color = "darkred")
  )
# Print the heatmap
print(heatmap_plot)

# Identify features with pairwise correlation above 0.9 and drop them to
# reduce multicollinearity before modeling
highly_correlated <- findCorrelation(correlation_matrix, cutoff = 0.9, names = TRUE)
if (length(highly_correlated) == 0) {
  cat("No highly correlated features found.\n")
} else {
  cat("Highly correlated features identified and removed:\n")
  print(highly_correlated)
  data <- select(data, -all_of(highly_correlated))
}
## Highly correlated features identified and removed:
## [1] "fuel_economy" "highwaympg"
# --- Reflection on Step 8 ---
cat("\n==== Reflection on Step 8 ====\n")
##
## ==== Reflection on Step 8 ====
# Narrative recap of the redundancy/correlation filtering.
# NOTE(review): item 1 claims 'Car_ID' was removed, but the Step 8 log above
# printed "No specified redundant features found." (the real column is spelled
# 'car_ID'), so the narrative overstates what actually happened.
cat(
"1. Redundant Features:\n",
" - Explicitly removed columns like 'Car_ID' which do not contribute to predictive modeling.\n\n",
"2. Correlation Matrix:\n",
" - Created a heatmap to visualize correlations among numeric features.\n",
" - Identified features with high correlation (cutoff: >0.9) and removed them to reduce multicollinearity.\n\n",
"By performing this step, the dataset is now more refined and ready for modeling.\n"
)
## 1. Redundant Features:
## - Explicitly removed columns like 'Car_ID' which do not contribute to predictive modeling.
##
## 2. Correlation Matrix:
## - Created a heatmap to visualize correlations among numeric features.
## - Identified features with high correlation (cutoff: >0.9) and removed them to reduce multicollinearity.
##
## By performing this step, the dataset is now more refined and ready for modeling.
# === Step 9: Export Final Dataset ===
cat("\n==== Step 9: Export Final Dataset ====\n")
##
## ==== Step 9: Export Final Dataset ====
# Destination file for the cleaned, feature-engineered dataset
output_file <- "car_prices_preprocessed.csv" # Descriptive name based on the context
# Write the CSV; a write failure is reported rather than aborting the script
tryCatch(
  {
    write.csv(data, output_file, row.names = FALSE)
    cat(sprintf("Final dataset successfully exported to '%s'.\n", output_file))
  },
  error = function(e) {
    cat(sprintf("Error exporting the final dataset: %s\n", conditionMessage(e)))
  }
)
# --- Reflection on Step 9 ---
cat("\n==== Reflection on Step 9 ====\n")
##
## ==== Reflection on Step 9 ====
# Joining with a single space reproduces cat()'s default argument separator.
step9_reflection <- c(
  "1. The cleaned and preprocessed dataset has been saved as a CSV file named 'car_prices_preprocessed.csv'.\n",
  "2. The file name reflects the dataset's purpose, making it easier to identify and reuse.\n",
  "3. This ensures consistency and reproducibility, providing a high-quality dataset ready for modeling or analysis.\n"
)
cat(paste(step9_reflection, collapse = " "))
## 1. The cleaned and preprocessed dataset has been saved as a CSV file named 'car_prices_preprocessed.csv'.
## 2. The file name reflects the dataset's purpose, making it easier to identify and reuse.
## 3. This ensures consistency and reproducibility, providing a high-quality dataset ready for modeling or analysis.
# === Modeling ===
## ==== Step 1: Data Splitting ====
cat("\n==== Step 1: Train-Test-Validation Split ====\n")
##
## ==== Step 1: Train-Test-Validation Split ====
# Coerce the target to numeric in case an earlier step left it non-numeric
data$price <- as.numeric(data$price)
# Split the data into training (70%), testing (15%), and validation (15%)
cat("\nSplitting the data into training, testing, and validation sets...\n")
##
## Splitting the data into training, testing, and validation sets...
set.seed(123) # Fixed seed keeps the partition reproducible
train_indices <- createDataPartition(data$price, p = 0.7, list = FALSE)
train_data <- data[train_indices, ]
holdout_data <- data[-train_indices, ]
# Divide the 30% holdout evenly between the test and validation sets
holdout_split <- createDataPartition(holdout_data$price, p = 0.5, list = FALSE)
test_data <- holdout_data[holdout_split, ]
validation_data <- holdout_data[-holdout_split, ]
cat(sprintf("Training set size: %d rows\n", nrow(train_data)))
cat(sprintf("Testing set size: %d rows\n", nrow(test_data)))
cat(sprintf("Validation set size: %d rows\n", nrow(validation_data)))
## Validation set size: 26 rows
## ==== Step 2: Data Preprocessing ====
cat("\n==== Step 2: Data Preprocessing ====\n")
##
## ==== Step 2: Data Preprocessing ====
# Remove identifier / free-text columns that should not enter the model.
# FIX: any_of() instead of all_of() -- all_of() errors if one of these columns
# was already dropped upstream (e.g. by the redundant-feature or correlation
# filters in Step 8); any_of() drops whichever of them are present.
excluded_columns <- c("car_ID", "CarName", "CarBrand") # Replace as needed
train_data <- train_data %>% select(-any_of(excluded_columns))
test_data <- test_data %>% select(-any_of(excluded_columns))
validation_data <- validation_data %>% select(-any_of(excluded_columns))
# Convert categorical variables to factors
categorical_vars <- names(train_data)[sapply(train_data, is.character)]
train_data[categorical_vars] <- lapply(train_data[categorical_vars], as.factor)
test_data[categorical_vars] <- lapply(test_data[categorical_vars], as.factor)
validation_data[categorical_vars] <- lapply(validation_data[categorical_vars], as.factor)
# One-hot encode categorical features (encoding learned on training data only)
cat("\nCreating dummy variables for categorical features...\n")
##
## Creating dummy variables for categorical features...
dummies <- dummyVars(" ~ .", data = train_data)
train_data <- data.frame(predict(dummies, newdata = train_data))
test_data <- data.frame(predict(dummies, newdata = test_data))
validation_data <- data.frame(predict(dummies, newdata = validation_data))
# Align each split to the training columns: dummy levels absent from a split
# become zero-filled columns, and column order is made identical.
# FIX: the original computed the missing columns from test_data only, so a
# column missing from validation_data but present in test_data would have
# broken the final reindexing; each split is now aligned independently.
align_to_columns <- function(df, reference_cols) {
  # Add any reference column the split lacks, filled with zeros
  for (col in setdiff(reference_cols, names(df))) {
    df[[col]] <- 0
  }
  df[, reference_cols]
}
test_data <- align_to_columns(test_data, names(train_data))
validation_data <- align_to_columns(validation_data, names(train_data))
## ==== Step 3: Scaling Numeric Features ====
cat("\n==== Step 3: Scaling Numeric Features ====\n")
##
## ==== Step 3: Scaling Numeric Features ====
# Identify numeric columns (excluding the target variable 'price')
numeric_cols <- setdiff(names(train_data)[sapply(train_data, is.numeric)], "price")
# Apply Min-Max Scaling using training data parameters only (the ranges are
# learned on train_data and reused for test/validation, avoiding leakage)
preProcValues <- preProcess(train_data[, numeric_cols], method = c("range"))
## Warning in preProcess.default(train_data[, numeric_cols], method = c("range")):
## No variation for for: fueltype., aspiration., doornumber., carbody.,
## drivewheel., enginelocation., enginetype., cylindernumber.,
## cylindernumber.twelve, fuelsystem., fuelsystem.spfi
# The "no variation" warning flags constant dummy columns; they are removed
# below by the near-zero-variance filter, so the warning is benign here.
train_data[, numeric_cols] <- predict(preProcValues, train_data[, numeric_cols])
test_data[, numeric_cols] <- predict(preProcValues, test_data[, numeric_cols])
validation_data[, numeric_cols] <- predict(preProcValues, validation_data[, numeric_cols])
cat("Features scaled using Min-Max scaling.\n")
## Features scaled using Min-Max scaling.
# Remove zero or near-zero variance predictors, using the same column indices
# on all three splits so they stay structurally identical
nzv <- nearZeroVar(train_data)
if (length(nzv) > 0) {
cat("\nRemoving near-zero variance predictors:\n")
print(names(train_data)[nzv])
train_data <- train_data[, -nzv]
test_data <- test_data[, -nzv]
validation_data <- validation_data[, -nzv]
}
##
## Removing near-zero variance predictors:
## [1] "fueltype." "fueltype.unknown" "aspiration."
## [4] "doornumber." "carbody." "carbody.convertible"
## [7] "carbody.flyingcar" "carbody.hardtop" "drivewheel."
## [10] "drivewheel.4wd" "enginelocation." "enginelocation.front"
## [13] "enginelocation.rear" "enginetype." "enginetype.dohcv"
## [16] "enginetype.l" "enginetype.rotor" "cylindernumber."
## [19] "cylindernumber.eight" "cylindernumber.five" "cylindernumber.twelve"
## [22] "cylindernumber.two" "fuelsystem." "fuelsystem.4bbl"
## [25] "fuelsystem.mfi" "fuelsystem.spfi"
## ==== Step 4: Build Linear Regression Model ====
cat("\n==== Step 4: Build Linear Regression Model ====\n")
##
## ==== Step 4: Build Linear Regression Model ====
# Specify formula (price as the dependent variable)
# Every remaining (encoded + scaled) column serves as a predictor.
formula <- price ~ .
# NOTE(review): this assignment shadows stats::formula for the rest of the
# script; harmless here, but a more specific name (e.g. price_formula) would
# be safer.
# Train the linear regression model
linear_model <- lm(formula, data = train_data)
# Display model summary
# The "(4 not defined because of singularities)" note in the output reflects
# a rank-deficient design: full one-hot encoding keeps all dummy levels, so
# some columns (aspiration.turbo, doornumber.two, fuelsystem.idi,
# segment.Luxury) are linear combinations of the others and get NA
# coefficients. Using dummyVars(..., fullRank = TRUE) would avoid this.
cat("\nModel Summary:\n")
##
## Model Summary:
print(summary(linear_model))
##
## Call:
## lm(formula = formula, data = train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3648.7 -661.3 -36.0 625.5 3913.8
##
## Coefficients: (4 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5730.81 2788.11 2.055 0.042609 *
## symboling 36.25 844.02 0.043 0.965829
## fueltype.diesel 10756.85 2328.81 4.619 1.22e-05 ***
## fueltype.gas 8262.17 2283.91 3.618 0.000481 ***
## aspiration.std -252.83 570.35 -0.443 0.658573
## aspiration.turbo NA NA NA NA
## doornumber.four 156.78 437.66 0.358 0.720974
## doornumber.two NA NA NA NA
## carbody.hatchback -1791.08 567.19 -3.158 0.002137 **
## carbody.sedan -1333.80 637.13 -2.093 0.039003 *
## carbody.wagon -1732.43 779.31 -2.223 0.028609 *
## drivewheel.fwd 41.32 923.81 0.045 0.964419
## drivewheel.rwd 14.62 1044.87 0.014 0.988866
## wheelbase 27.36 1796.04 0.015 0.987876
## carlength -1588.92 1960.73 -0.810 0.419774
## carwidth 1463.76 1546.88 0.946 0.346439
## carheight -1959.69 1075.46 -1.822 0.071607 .
## curbweight -642.23 4444.20 -0.145 0.885407
## enginetype.dohc -1801.35 856.05 -2.104 0.038023 *
## enginetype.ohc 151.18 621.35 0.243 0.808300
## enginetype.ohcf 1467.67 1108.32 1.324 0.188636
## enginetype.ohcv -3104.89 952.48 -3.260 0.001553 **
## cylindernumber.four -1315.95 719.26 -1.830 0.070482 .
## cylindernumber.six 495.45 845.60 0.586 0.559339
## enginesize 286.08 1999.15 0.143 0.886517
## fuelsystem.1bbl 983.04 1148.98 0.856 0.394406
## fuelsystem.2bbl 504.44 995.29 0.507 0.613463
## fuelsystem.idi NA NA NA NA
## fuelsystem.mpfi -53.93 938.08 -0.057 0.954279
## fuelsystem.spdi -1217.84 1062.05 -1.147 0.254421
## boreratio -795.79 1288.27 -0.618 0.538250
## stroke 32.88 978.40 0.034 0.973261
## compressionratio -1806.80 1080.46 -1.672 0.097801 .
## horsepower 27279.89 7022.80 3.884 0.000191 ***
## peakrpm 931.17 808.91 1.151 0.252593
## citympg 887.18 1780.40 0.498 0.619433
## hp_to_weight -20675.39 7669.38 -2.696 0.008319 **
## segment.Economy -4026.84 607.63 -6.627 2.13e-09 ***
## segment.Standard -3018.43 477.77 -6.318 8.77e-09 ***
## segment.Luxury NA NA NA NA
## BrandAvgPrice 9079.27 867.75 10.463 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1251 on 94 degrees of freedom
## Multiple R-squared: 0.9768, Adjusted R-squared: 0.968
## F-statistic: 110.1 on 36 and 94 DF, p-value: < 2.2e-16
## ==== Step 5: Evaluate Model Performance ====
cat("\n==== Step 5: Evaluate Model Performance ====\n")
##
## ==== Step 5: Evaluate Model Performance ====
# Root-mean-squared error of predictions against observed values.
# Factored out so the same formula is not repeated three times below.
rmse <- function(predicted, actual) {
  sqrt(mean((predicted - actual)^2))
}
# Squared Pearson correlation, used as out-of-sample R-squared.
r_squared <- function(actual, predicted) {
  cor(actual, predicted)^2
}
# Predict on training, testing, and validation datasets
train_predictions <- predict(linear_model, train_data)
test_predictions <- predict(linear_model, test_data)
validation_predictions <- predict(linear_model, validation_data)
## Warning in predict.lm(linear_model, validation_data): prediction from
## rank-deficient fit; attr(*, "non-estim") has doubtful cases
# Calculate RMSE and R-squared for training data
# (training R-squared comes from the fitted model itself, an in-sample
# definition; test/validation use squared correlation instead)
train_rmse <- rmse(train_predictions, train_data$price)
train_r2 <- summary(linear_model)$r.squared
cat(sprintf("Training RMSE: %.2f\n", train_rmse))
## Training RMSE: 1059.73
cat(sprintf("Training R-squared: %.2f\n", train_r2))
## Training R-squared: 0.98
# Calculate RMSE and R-squared for testing data
test_rmse <- rmse(test_predictions, test_data$price)
test_r2 <- r_squared(test_data$price, test_predictions)
cat(sprintf("Testing RMSE: %.2f\n", test_rmse))
## Testing RMSE: 1562.67
cat(sprintf("Testing R-squared: %.2f\n", test_r2))
## Testing R-squared: 0.97
# Calculate RMSE and R-squared for validation data
validation_rmse <- rmse(validation_predictions, validation_data$price)
validation_r2 <- r_squared(validation_data$price, validation_predictions)
cat(sprintf("Validation RMSE: %.2f\n", validation_rmse))
## Validation RMSE: 1785.84
cat(sprintf("Validation R-squared: %.2f\n", validation_r2))
## Validation R-squared: 0.94
## ==== Step 6: Visualize Model Results ====
cat("\n==== Step 6: Visualize Model Results ====\n")
##
## ==== Step 6: Visualize Model Results ====
# Residual plot: residuals should scatter evenly around the zero line if the
# model's errors are well behaved.
residuals <- test_data$price - test_predictions
resid_plot_df <- data.frame(Predicted = test_predictions, Residuals = residuals)
ggplot(resid_plot_df, aes(x = Predicted, y = Residuals)) +
  geom_point(color = "darkblue", alpha = 0.6) +
  geom_hline(yintercept = 0, color = "red", linetype = "dashed") +
  labs(
    title = "Residual Plot: Predicted vs Residuals",
    x = "Predicted Prices",
    y = "Residuals"
  ) +
  theme_minimal()

# Actual vs predicted: points close to the dashed identity line indicate
# accurate predictions.
pred_plot_df <- data.frame(Actual = test_data$price, Predicted = test_predictions)
ggplot(pred_plot_df, aes(x = Actual, y = Predicted)) +
  geom_point(color = "darkgreen", alpha = 0.6) +
  geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") +
  labs(
    title = "Actual vs Predicted Prices",
    x = "Actual Prices",
    y = "Predicted Prices"
  ) +
  theme_minimal()
## ==== Step 7: Confusion Matrix Heatmap ====
cat("\n==== Step 7: Confusion Matrix Heatmap ====\n")
##
## ==== Step 7: Confusion Matrix Heatmap ====
# Round predictions and actual prices for classification-like analysis
# NOTE(review): rounding continuous prices to the nearest unit still leaves
# nearly every value unique, so this "confusion matrix" is a sparse table of
# mostly singleton cells rather than a class-vs-class matrix. Binning prices
# into a few ranges (as Step 8 does for the ROC analysis) would yield a far
# more interpretable heatmap — confirm the intent before relying on this plot.
actual <- round(test_data$price)
predicted <- round(test_predictions)
conf_matrix <- table(Predicted = predicted, Actual = actual)
conf_matrix_df <- as.data.frame(conf_matrix)
ggplot(conf_matrix_df, aes(x = Predicted, y = Actual, fill = Freq)) +
geom_tile(color = "white") +
geom_text(aes(label = Freq), color = "black") +
scale_fill_gradient(low = "lightblue", high = "blue") +
labs(title = "Confusion Matrix Heatmap", x = "Predicted", y = "Actual") +
theme_minimal()
## ==== Step 8: ROC Curve ====
cat("\n==== Step 8: ROC Curve ====\n")
##
## ==== Step 8: ROC Curve ====
# Bin 'price' into categories
# Tertile boundaries come from the observed test prices; -Inf/Inf endpoints
# make the cut robust to values outside the observed range.
price_bins <- quantile(test_data$price, probs = c(0, 0.33, 0.66, 1), na.rm = TRUE)
test_data$price_bin <- cut(
  test_data$price,
  breaks = c(-Inf, price_bins[2], price_bins[3], Inf),
  labels = c("Low", "Medium", "High"),
  include.lowest = TRUE
)
# Generate ROC curve
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
# Pairwise one-vs-one ROC analysis of the continuous predictions against the
# three price bins; $rocs holds one roc object per class pair.
roc_curve <- multiclass.roc(as.numeric(test_data$price_bin), test_predictions)
## Setting direction: controls < cases
## Setting direction: controls < cases
## Setting direction: controls < cases
roc_list <- roc_curve$rocs
# One colour per pairwise curve, computed once instead of inside the loop.
curve_cols <- rainbow(length(roc_list))
plot(roc_list[[1]], col = "red", main = "ROC Curves for Multiclass Linear Regression")
# FIX: the original loop used `2:length(roc_list)`, which yields c(2, 1) and
# indexes out of bounds when only one pairwise curve exists;
# seq_along(...)[-1] is empty in that case.
for (i in seq_along(roc_list)[-1]) {
  plot(roc_list[[i]], add = TRUE, col = curve_cols[i])
}
legend(
  "bottomright",
  legend = paste("Class", seq_along(roc_list)),
  col = curve_cols,
  lty = 1
)
## ==== Step 9: Precision-Recall Curve ====
cat("\n==== Step 9: Precision-Recall Curve ====\n")
##
## ==== Step 9: Precision-Recall Curve ====
library(PRROC)
# Treat "High" as the positive class and use the regression output as its
# score; weights.class0 marks which observations are positives.
is_high <- as.numeric(test_data$price_bin == "High")
pr_curve <- pr.curve(
  scores.class0 = test_predictions,
  weights.class0 = is_high,
  curve = TRUE
)
plot(pr_curve, main = "Precision-Recall Curve for High Price Class")
# === Reflection on the Modeling Process ===
#
# 1. **Data Splitting**: The dataset was split into training (70%), testing (15%), and validation (15%), ensuring robust performance evaluation.
#
# 2. **Data Preprocessing**: Features were encoded, scaled, and aligned across datasets, avoiding data leakage and ensuring consistency.
#
# 3. **Linear Regression**: The model revealed key drivers of car pricing and demonstrated reasonable predictive accuracy.
#
# 4. **Performance Evaluation**: Metrics like RMSE and R-squared showed minimal overfitting, with validation results aligning well with testing.
#
# 5. **Visualizations and Metrics**: Residual plots, ROC, and Precision-Recall curves identified areas for improvement, especially for high-price predictions.
#
# **Future Improvements**:
# - Explore non-linear models and feature engineering to capture complex patterns.
# - Refine outlier handling and prediction thresholds for better class differentiation.
# === Modeling and Comparison: Linear Regression vs Random Forest ===
# ==== Step 1: Data Splitting ====
cat("\n==== Step 1: Train-Test-Validation Split ====\n")
##
## ==== Step 1: Train-Test-Validation Split ====
# Ensure 'price' is numeric
data$price <- as.numeric(data$price)
# Split the data into training (70%), testing (15%), and validation (15%)
# NOTE(review): this re-splits the raw `data` and overwrites the
# train/test/validation frames built earlier in the script, so the second
# modelling pass starts from unprocessed data again — confirm this is intended.
set.seed(123) # For reproducibility
# createDataPartition samples within groups of the outcome (per caret docs),
# keeping the price distribution similar across splits.
train_indices <- createDataPartition(data$price, p = 0.7, list = FALSE)
train_data <- data[train_indices, ]
remaining_data <- data[-train_indices, ]
# Further split remaining data into testing and validation
# Half of the remaining 30% goes to test and half to validation (~15% each).
test_indices <- createDataPartition(remaining_data$price, p = 0.5, list = FALSE)
test_data <- remaining_data[test_indices, ]
validation_data <- remaining_data[-test_indices, ]
cat(sprintf("Training set size: %d rows\n", nrow(train_data)))
## Training set size: 131 rows
cat(sprintf("Testing set size: %d rows\n", nrow(test_data)))
## Testing set size: 28 rows
cat(sprintf("Validation set size: %d rows\n", nrow(validation_data)))
## Validation set size: 26 rows
# ==== Step 2: Data Preprocessing ====
cat("\n==== Step 2: Data Preprocessing ====\n")
##
## ==== Step 2: Data Preprocessing ====
# Remove unnecessary columns
excluded_columns <- c("car_ID", "CarName", "CarBrand") # Replace as needed
train_data <- train_data %>% select(-all_of(excluded_columns))
test_data <- test_data %>% select(-all_of(excluded_columns))
validation_data <- validation_data %>% select(-all_of(excluded_columns))
# Convert categorical variables to factors
categorical_vars <- names(train_data)[sapply(train_data, is.character)]
train_data[categorical_vars] <- lapply(train_data[categorical_vars], as.factor)
test_data[categorical_vars] <- lapply(test_data[categorical_vars], as.factor)
validation_data[categorical_vars] <- lapply(validation_data[categorical_vars], as.factor)
# Create dummy variables for categorical features
cat("\nCreating dummy variables for categorical features...\n")
##
## Creating dummy variables for categorical features...
# Encoder fitted on training data only, then applied to all splits.
dummies <- dummyVars(" ~ .", data = train_data)
train_data <- data.frame(predict(dummies, newdata = train_data))
test_data <- data.frame(predict(dummies, newdata = test_data))
validation_data <- data.frame(predict(dummies, newdata = validation_data))
# Ensure datasets have the same columns.
# FIX: compute the missing columns separately for each dataset; the original
# reused the test-set difference for validation_data, overwriting any existing
# validation column of the same name with 0.
for (col in setdiff(names(train_data), names(test_data))) {
  test_data[[col]] <- 0
}
for (col in setdiff(names(train_data), names(validation_data))) {
  validation_data[[col]] <- 0
}
test_data <- test_data[, names(train_data)]
validation_data <- validation_data[, names(train_data)]
# Scale numeric features (min-max range learned from the training data only)
numeric_cols <- setdiff(names(train_data)[sapply(train_data, is.numeric)], "price")
preProcValues <- preProcess(train_data[, numeric_cols], method = c("range"))
## Warning in preProcess.default(train_data[, numeric_cols], method = c("range")):
## No variation for for: fueltype., aspiration., doornumber., carbody.,
## drivewheel., enginelocation., enginetype., cylindernumber.,
## cylindernumber.twelve, fuelsystem., fuelsystem.spfi
train_data[, numeric_cols] <- predict(preProcValues, train_data[, numeric_cols])
test_data[, numeric_cols] <- predict(preProcValues, test_data[, numeric_cols])
validation_data[, numeric_cols] <- predict(preProcValues, validation_data[, numeric_cols])
# Remove near-zero variance predictors (indices computed on training data;
# valid for all splits because the columns were aligned above)
nzv <- nearZeroVar(train_data)
if (length(nzv) > 0) {
  train_data <- train_data[, -nzv]
  test_data <- test_data[, -nzv]
  validation_data <- validation_data[, -nzv]
}
# ==== Step 3: Linear Regression ====
cat("\n==== Step 3: Linear Regression ====\n")
##
## ==== Step 3: Linear Regression ====
# Fit OLS using every preprocessed column as a predictor of price.
linear_model <- lm(price ~ ., data = train_data)
cat("\nLinear Regression Model Summary:\n")
##
## Linear Regression Model Summary:
# The "(4 not defined because of singularities)" note below reflects a
# rank-deficient design: full one-hot encoding keeps all dummy levels, so
# some columns are linear combinations of others and get NA coefficients.
print(summary(linear_model))
##
## Call:
## lm(formula = price ~ ., data = train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3648.7 -661.3 -36.0 625.5 3913.8
##
## Coefficients: (4 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5730.81 2788.11 2.055 0.042609 *
## symboling 36.25 844.02 0.043 0.965829
## fueltype.diesel 10756.85 2328.81 4.619 1.22e-05 ***
## fueltype.gas 8262.17 2283.91 3.618 0.000481 ***
## aspiration.std -252.83 570.35 -0.443 0.658573
## aspiration.turbo NA NA NA NA
## doornumber.four 156.78 437.66 0.358 0.720974
## doornumber.two NA NA NA NA
## carbody.hatchback -1791.08 567.19 -3.158 0.002137 **
## carbody.sedan -1333.80 637.13 -2.093 0.039003 *
## carbody.wagon -1732.43 779.31 -2.223 0.028609 *
## drivewheel.fwd 41.32 923.81 0.045 0.964419
## drivewheel.rwd 14.62 1044.87 0.014 0.988866
## wheelbase 27.36 1796.04 0.015 0.987876
## carlength -1588.92 1960.73 -0.810 0.419774
## carwidth 1463.76 1546.88 0.946 0.346439
## carheight -1959.69 1075.46 -1.822 0.071607 .
## curbweight -642.23 4444.20 -0.145 0.885407
## enginetype.dohc -1801.35 856.05 -2.104 0.038023 *
## enginetype.ohc 151.18 621.35 0.243 0.808300
## enginetype.ohcf 1467.67 1108.32 1.324 0.188636
## enginetype.ohcv -3104.89 952.48 -3.260 0.001553 **
## cylindernumber.four -1315.95 719.26 -1.830 0.070482 .
## cylindernumber.six 495.45 845.60 0.586 0.559339
## enginesize 286.08 1999.15 0.143 0.886517
## fuelsystem.1bbl 983.04 1148.98 0.856 0.394406
## fuelsystem.2bbl 504.44 995.29 0.507 0.613463
## fuelsystem.idi NA NA NA NA
## fuelsystem.mpfi -53.93 938.08 -0.057 0.954279
## fuelsystem.spdi -1217.84 1062.05 -1.147 0.254421
## boreratio -795.79 1288.27 -0.618 0.538250
## stroke 32.88 978.40 0.034 0.973261
## compressionratio -1806.80 1080.46 -1.672 0.097801 .
## horsepower 27279.89 7022.80 3.884 0.000191 ***
## peakrpm 931.17 808.91 1.151 0.252593
## citympg 887.18 1780.40 0.498 0.619433
## hp_to_weight -20675.39 7669.38 -2.696 0.008319 **
## segment.Economy -4026.84 607.63 -6.627 2.13e-09 ***
## segment.Standard -3018.43 477.77 -6.318 8.77e-09 ***
## segment.Luxury NA NA NA NA
## BrandAvgPrice 9079.27 867.75 10.463 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1251 on 94 degrees of freedom
## Multiple R-squared: 0.9768, Adjusted R-squared: 0.968
## F-statistic: 110.1 on 36 and 94 DF, p-value: < 2.2e-16
# Score each split with the fitted model.
train_pred_lm <- predict(linear_model, train_data)
test_pred_lm <- predict(linear_model, test_data)
validation_pred_lm <- predict(linear_model, validation_data)
## Warning in predict.lm(linear_model, validation_data): prediction from
## rank-deficient fit; attr(*, "non-estim") has doubtful cases
# The warning stems from the NA (aliased) coefficients noted in the summary.
# RMSE per split.
train_rmse_lm <- sqrt(mean((train_pred_lm - train_data$price)^2))
test_rmse_lm <- sqrt(mean((test_pred_lm - test_data$price)^2))
validation_rmse_lm <- sqrt(mean((validation_pred_lm - validation_data$price)^2))
# NOTE(review): train R² is the model's in-sample R², while test/validation
# use squared correlation — related but not identical definitions.
train_r2_lm <- summary(linear_model)$r.squared
test_r2_lm <- cor(test_data$price, test_pred_lm)^2
validation_r2_lm <- cor(validation_data$price, validation_pred_lm)^2
cat(sprintf("Linear Regression RMSE - Train: %.2f, Test: %.2f, Validation: %.2f\n",
train_rmse_lm, test_rmse_lm, validation_rmse_lm))
## Linear Regression RMSE - Train: 1059.73, Test: 1562.67, Validation: 1785.84
# ==== Step 4: Random Forest ====
cat("\n==== Step 4: Random Forest ====\n")
##
## ==== Step 4: Random Forest ====
library(randomForest)
# 100-tree regression forest; importance = TRUE records the variable
# importance used for the feature-importance plot further below.
rf_model <- randomForest(price ~ ., data = train_data, ntree = 100, importance = TRUE)
cat("\nRandom Forest Model Summary:\n")
##
## Random Forest Model Summary:
print(rf_model)
##
## Call:
## randomForest(formula = price ~ ., data = train_data, ntree = 100, importance = TRUE)
## Type of random forest: regression
## Number of trees: 100
## No. of variables tried at each split: 13
##
## Mean of squared residuals: 3049425
## % Var explained: 93.71
# Score every split with the fitted forest.
train_pred_rf <- predict(rf_model, train_data)
test_pred_rf <- predict(rf_model, test_data)
validation_pred_rf <- predict(rf_model, validation_data)
# RMSE and squared-correlation R² per split.
train_rmse_rf <- sqrt(mean((train_data$price - train_pred_rf)^2))
test_rmse_rf <- sqrt(mean((test_data$price - test_pred_rf)^2))
validation_rmse_rf <- sqrt(mean((validation_data$price - validation_pred_rf)^2))
train_r2_rf <- cor(train_pred_rf, train_data$price)^2
test_r2_rf <- cor(test_pred_rf, test_data$price)^2
validation_r2_rf <- cor(validation_pred_rf, validation_data$price)^2
cat(sprintf("Random Forest RMSE - Train: %.2f, Test: %.2f, Validation: %.2f\n",
            train_rmse_rf, test_rmse_rf, validation_rmse_rf))
## Random Forest RMSE - Train: 771.33, Test: 2012.99, Validation: 1444.11
# ==== Step 5: Comparison Metrics ====
# Assemble one row of metrics per model, then stack them for reporting.
lm_metrics <- data.frame(
  Model = "Linear Regression",
  RMSE_Train = train_rmse_lm,
  RMSE_Test = test_rmse_lm,
  RMSE_Validation = validation_rmse_lm,
  R2_Train = train_r2_lm,
  R2_Test = test_r2_lm,
  R2_Validation = validation_r2_lm
)
rf_metrics <- data.frame(
  Model = "Random Forest",
  RMSE_Train = train_rmse_rf,
  RMSE_Test = test_rmse_rf,
  RMSE_Validation = validation_rmse_rf,
  R2_Train = train_r2_rf,
  R2_Test = test_r2_rf,
  R2_Validation = validation_r2_rf
)
comparison <- rbind(lm_metrics, rf_metrics)
cat("\n==== Model Comparison ====\n")
##
## ==== Model Comparison ====
print(comparison)
## Model RMSE_Train RMSE_Test RMSE_Validation R2_Train R2_Test
## 1 Linear Regression 1059.734 1562.667 1785.839 0.9768432 0.9678338
## 2 Random Forest 771.329 2012.991 1444.109 0.9888207 0.9527542
## R2_Validation
## 1 0.9357571
## 2 0.9563989
# ==== Step 6: Professional Visualizations ====
# 1. RMSE and R² Comparison (Bar Plots)
# Reshape the comparison table to long form and tag each metric family so
# RMSE and R² can be faceted on independent y scales.
comparison_long <- comparison %>%
  pivot_longer(
    cols = starts_with("RMSE") | starts_with("R2"),
    names_to = "Metric",
    values_to = "Value"
  ) %>%
  mutate(
    # Every metric name matches one of the two patterns, so no default needed.
    Metric_Type = case_when(
      grepl("RMSE", Metric) ~ "RMSE (Error)",
      grepl("R2", Metric) ~ "R² (Variance Explained)"
    )
  )
# geom_col() is the idiomatic equivalent of geom_bar(stat = "identity").
ggplot(comparison_long, aes(x = Model, y = Value, fill = Model)) +
  geom_col(position = "dodge") +
  facet_wrap(~ Metric_Type, scales = "free_y", nrow = 2) +
  labs(
    title = "Model Comparison: RMSE and R²",
    x = "Model",
    y = "Value",
    fill = "Model"
  ) +
  theme_minimal(base_size = 14) +
  scale_fill_manual(values = c("Linear Regression" = "#4E79A7", "Random Forest" = "#F28E2B")) +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    strip.text = element_text(face = "bold", size = 12),
    legend.position = "top"
  ) +
  geom_text(aes(label = round(Value, 2)), vjust = -0.5, size = 3.5)

# 2. Actual vs. Predicted Prices: Test Data
# Reshape to long form so both models share a single scatter plot; the dashed
# identity line marks perfect prediction.
test_data_preds <- data.frame(
  Actual = test_data$price,
  Linear_Regression = test_pred_lm,
  Random_Forest = test_pred_rf
)
test_data_preds <- pivot_longer(
  test_data_preds,
  cols = -Actual,
  names_to = "Model",
  values_to = "Predicted"
)
ggplot(test_data_preds, aes(x = Actual, y = Predicted, color = Model)) +
  geom_point(alpha = 0.6, size = 3) +
  geom_abline(slope = 1, intercept = 0, color = "black", linetype = "dashed") +
  labs(
    title = "Actual vs Predicted Prices (Test Data)",
    subtitle = "Comparison of Prediction Accuracy for Linear Regression and Random Forest",
    x = "Actual Prices",
    y = "Predicted Prices",
    color = "Model"
  ) +
  theme_minimal(base_size = 14) +
  scale_color_manual(values = c("Linear_Regression" = "#4E79A7", "Random_Forest" = "#F28E2B")) +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5, face = "italic"),
    legend.position = "top"
  )

# 3. Residual Analysis: Test Data
# Residuals (actual - predicted) per model, plotted against the actual price
# to reveal systematic over- or under-prediction.
residuals_data <- data.frame(
  Actual = test_data$price,
  Linear_Regression_Residuals = test_data$price - test_pred_lm,
  Random_Forest_Residuals = test_data$price - test_pred_rf
)
residuals_data <- pivot_longer(
  residuals_data,
  cols = starts_with("Linear_Regression") | starts_with("Random_Forest"),
  names_to = "Model",
  values_to = "Residuals"
)
ggplot(residuals_data, aes(x = Actual, y = Residuals, color = Model)) +
  geom_point(alpha = 0.6, size = 3) +
  geom_hline(yintercept = 0, color = "black", linetype = "dashed") +
  labs(
    title = "Residual Analysis: Actual Prices vs. Residuals",
    subtitle = "Evaluating Error Distribution Across Models",
    x = "Actual Prices",
    y = "Residuals",
    color = "Model"
  ) +
  theme_minimal(base_size = 14) +
  scale_color_manual(values = c("Linear_Regression_Residuals" = "#4E79A7",
                                "Random_Forest_Residuals" = "#F28E2B")) +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5, face = "italic"),
    legend.position = "top"
  )

# 4. Feature Importance: Random Forest
# Compute the importance matrix once — the original called importance() twice.
# Column 1 of the matrix is %IncMSE, the permutation-based importance for
# regression forests fitted with importance = TRUE.
rf_importance <- importance(rf_model)
importance_df <- data.frame(
  Feature = rownames(rf_importance),
  Importance = rf_importance[, 1]
) %>%
  arrange(desc(Importance))
# geom_col() is the idiomatic equivalent of geom_bar(stat = "identity").
ggplot(importance_df, aes(x = reorder(Feature, Importance), y = Importance)) +
  geom_col(fill = "#76B7B2") +
  coord_flip() +
  labs(
    title = "Feature Importance (Random Forest)",
    subtitle = "Top Features Contributing to Car Price Prediction",
    x = "Features",
    y = "Importance Score"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5, face = "italic")
  )

# ==== Step 7: Reflection ====
cat("\n==== Reflection ====\n")
##
## ==== Reflection ====
# Closing remarks on the trade-offs between the two models.
reflection_notes <- c(
  "Linear Regression is simpler and interpretable, but its performance is limited with non-linear data.\n",
  "Random Forest provides better accuracy across training, testing, and validation datasets.\n",
  "However, Random Forest is less interpretable and may require more resources for training.\n"
)
invisible(lapply(reflection_notes, cat))
## Linear Regression is simpler and interpretable, but its performance is limited with non-linear data.
## Random Forest provides better accuracy across training, testing, and validation datasets.
## However, Random Forest is less interpretable and may require more resources for training.